1 ;;; dat/html.lisp --- HTML parser
3 ;; see https://github.com/rotatef/cl-html5-parser
5 ;; spec: https://html.spec.whatwg.org/
9 ;; HTML is usually associated with XML, but not all HTML is valid
12 ;; This package provides a pretty good HTML parser, with the default
13 ;; DOM being the one used in our XML package DAT/XML.
18 (in-package :dat/html)
20 (deftype array-length ()
21 "Type of an array index."
22 '(integer 0 #.array-dimension-limit))
25 "Type of the input stream buffer."
26 '(vector character *))
28 (defparameter *default-encoding* :utf-8)
30 (defclass html-input-stream ()
31 ((source :initarg :source)
32 (encoding :reader html5-stream-encoding)
33 (char-stream :initform nil)
37 (errors :initform nil :accessor html5-stream-errors)))
39 (defun make-html-input-stream (source &key override-encoding fallback-encoding)
40 (when (stringp source)
41 ;; Encoding is not relevant when input is a string,
42 ;; but we set it utf-8 here to avoid auto detecting taking place.
43 (setf override-encoding :utf-8))
44 (let ((self (make-instance 'html-input-stream :source source)))
45 (with-slots (encoding stream) self
46 (setf encoding (detect-encoding self
47 (find-encoding override-encoding)
48 (find-encoding fallback-encoding)))
49 (open-char-stream self))
52 ;; 12.2.2.2 Character encodings
53 (defun find-encoding (encoding-name)
54 ;; Normalize the string designator
55 (setf encoding-name (string-upcase (substitute #\- #\_ (string-trim +space-characters+ (string encoding-name)))))
56 ;; All known encoding will already be interned in the keyword package so find-symbol is fine here
57 (setf encoding-name (find-symbol encoding-name :keyword))
60 ;; Verfiy that flexi-streams knows the encoding and resolve aliases
61 (case (flex:external-format-name (flex:make-external-format encoding-name))
62 ;; Some encoding should be replaced by some other.
63 ;; Only those supported by flexi-streams are listed here.
64 ;; iso-8859-11 should be replaced by windows-874, but flexi-streams doesn't that encoding.
65 (:iso-8859-1 :windows-1252)
66 (:iso-8859-9 :windows-1254)
67 (:us-ascii :windows-1252)
68 (otherwise encoding-name))
69 (flex:external-format-error ())))
71 ;; 12.2.2.1 Determining the character encoding
72 (defun detect-encoding (stream override-encoding fallback-encoding)
73 (with-slots (encoding) stream
75 ;; 1. and 2. encoding overridden by user or transport layer
76 (when override-encoding
77 (return (cons override-encoding :certain)))
79 ;; 3. wait for 1024 bytes, not implemented
82 (let ((bom-encoding (detect-bom stream)))
84 (return (cons bom-encoding :certain))))
86 ;; 5. Prescan not implemented
88 ;; 6. Use fallback encoding
89 (when fallback-encoding
90 (return (cons encoding :tentative)))
92 ;; 7. Autodect not implemented
94 ;; 8. Implementation-defined default
95 (return (cons *default-encoding* :tentative)))))
97 (defmacro handle-encoding-errors (stream &body body)
98 `(handler-bind ((flex:external-format-encoding-error
101 (push :invalid-codepoint (html5-stream-errors ,stream))
102 (use-value #\uFFFD))))
105 (defun open-char-stream (self)
106 (with-slots (source encoding char-stream chunk chunk-offset pending-cr) self
107 (setf chunk (make-array (* 10 1024) :element-type 'character :fill-pointer 0))
108 (setf chunk-offset 0)
109 (setf pending-cr nil)
114 (make-string-input-stream source)
115 (flex:make-flexi-stream
118 (open source :element-type '(unsigned-byte 8)))
122 (flex:make-in-memory-input-stream source)))
123 :external-format (flex:make-external-format (car encoding) :eol-style :lf))))
124 ;; 12.2.2.4 says we should always skip the first byte order mark
125 (handle-encoding-errors self
126 (let ((first-char (peek-char nil char-stream nil)))
127 (when (eql first-char #\ufeff)
128 (read-char char-stream))))))
130 (defun detect-bom (self)
131 (with-slots (source) self
132 (let (byte-0 byte-1 byte-2)
135 (when (> (length source) 0) (setf byte-0 (aref source 0)))
136 (when (> (length source) 1) (setf byte-1 (aref source 1)))
137 (when (> (length source) 2) (setf byte-2 (aref source 2))))
139 (with-open-file (in source :element-type '(unsigned-byte 8))
140 (setf byte-0 (read-byte in nil))
141 (setf byte-1 (read-byte in nil))
142 (setf byte-2 (read-byte in nil))))
144 (error "Can't detect encoding when source is a stream.")))
145 (cond ((and (eql byte-0 #xfe)
148 ((and (eql byte-0 #xff)
151 ((and (eql byte-0 #xef)
156 ;; 12.2.2.3 Changing the encoding while parsing
157 (defun html5-stream-change-encoding (stream new-encoding)
158 (setf new-encoding (find-encoding new-encoding))
159 (with-slots (encoding char-stream) stream
161 (when (member (car encoding) '(:utf-16le :utf-16be))
162 (setf encoding (cons (car encoding) :certain))
163 (return-from html5-stream-change-encoding))
166 (when (member new-encoding '(:utf-16le :utf-16be))
167 (setf new-encoding :utf-8))
170 (when (eql (car encoding) new-encoding)
171 (setf encoding (cons (car encoding) :certain))
172 (return-from html5-stream-change-encoding))
176 ;; 5. Restart paring from scratch
177 (setf encoding (cons new-encoding :certain))
178 (open-char-stream stream)
179 (throw 'please-reparse t)))
181 (defun html5-stream-char (stream)
182 (with-slots (chunk chunk-offset) stream
183 (when (>= chunk-offset (length chunk))
184 (unless (read-chunk stream)
185 (return-from html5-stream-char +eof+)))
186 (prog1 (char chunk chunk-offset)
187 (incf chunk-offset))))
189 (defun our-scan (chars opposite-p chunk &key start)
190 (loop for i from start below (length chunk)
191 for char = (char chunk i)
193 (position char chars)
194 (not (position char chars)))
197 (defun html5-stream-chars-until (stream characters &optional opposite-p)
198 "Returns a string of characters from the stream up to but not
199 including any character in characters or end of file.
201 (with-slots (chunk chunk-offset) stream
202 (declare (array-length chunk-offset) (chunk chunk))
203 (with-output-to-string (data)
204 (loop for end = (our-scan characters opposite-p chunk :start chunk-offset) do
205 ;; If nothing matched, and it wasn't because we ran out of chunk,
208 (/= chunk-offset (length chunk)))
210 ;; If not the whole chunk matched, return everything
211 ;; up to the part that didn't match
213 (/= chunk-offset (length chunk)))
214 (write-string chunk data :start chunk-offset :end end)
215 (setf chunk-offset end)
217 ;; If the whole remainder of the chunk matched,
218 ;; use it all and read the next chunk
219 (write-string chunk data :start chunk-offset)
220 (unless (read-chunk stream)
223 (defun html5-stream-unget (stream char)
224 (with-slots (chunk chunk-offset) stream
225 (unless (eql char +eof+)
226 (cond ((zerop chunk-offset)
227 (cond ((< (fill-pointer chunk) (array-dimension chunk 0))
228 (incf (fill-pointer chunk))
229 (replace chunk chunk :start1 1))
231 (let ((new-chunk (make-array (1+ (array-dimension chunk 0))
232 :element-type 'character
233 :fill-pointer (1+ (fill-pointer chunk)))))
234 (replace new-chunk chunk :start1 1)
235 (setf chunk new-chunk))))
236 (setf (char chunk 0) char))
239 (assert (char= char (char chunk chunk-offset))))))))
241 (defun read-chunk (stream)
242 (declare (optimize speed))
243 (with-slots (char-stream chunk chunk-offset pending-cr) stream
244 (declare (array-length chunk-offset)
246 (setf chunk-offset 0)
249 (setf (char chunk 0) #\Return)
251 (setf pending-cr nil))
253 (setf (fill-pointer chunk) (array-dimension chunk 0))
254 (handle-encoding-errors stream
255 (setf (fill-pointer chunk) (read-sequence chunk char-stream :start start)))
257 (unless (zerop (length chunk))
259 ;; check if last char is CR and EOF was not reached
260 (when (and (= (length chunk) (array-dimension chunk 0))
261 (eql (char chunk (1- (length chunk))) #\Return))
263 (decf (fill-pointer chunk)))
265 (report-character-errors stream chunk)
267 ;; Python code replaces surrugate pairs with U+FFFD here. Why?
269 ;; Normalize line endings (CR LF)
270 (loop for previous = nil then current
271 for current across chunk
272 for index of-type array-length from 0
273 with offset of-type array-length = 0
274 do (unless (and (eql previous #\Return)
275 (eql current #\Newline))
276 (unless (= index offset)
277 (setf (char chunk offset) current))
278 (when (eql current #\Return)
279 (setf (char chunk offset) #\Newline))
281 finally (setf (fill-pointer chunk) offset))
285 (defun char-range (char1 char2)
286 (loop for i from (char-code char1) to (char-code char2)
287 collect (code-char i)))
289 (defparameter *invalid-unicode*
290 `(,@(char-range #\u0001 #\u0008)
292 ,@(char-range #\u000E #\u001F)
293 ,@(char-range #\u007F #\u009F)
294 ;; The following are noncharacter as defined by Unicode.
295 ;; Clozure Common Lisp doesn't like them.
297 ,@(char-range #\uD800 #\uDFFF)
298 ,@(char-range #\uFDD0 #\uFDEF)
334 (defparameter *invalid-unicode-hash* (make-hash-table))
335 (dolist (char *invalid-unicode*)
336 (setf (gethash char *invalid-unicode-hash*) char))
338 (defun report-character-errors (stream data)
339 (loop for char across data
340 when (gethash char *invalid-unicode-hash*)
341 do (push :invalid-codepoint (html5-stream-errors stream))))
344 (defclass html-tokenizer ()
345 ((stream :initarg :stream :reader tokenizer-stream)
346 (cdata-switch-helper :initarg :cdata-switch-helper
347 :initform (constantly nil))
348 (lowercase-element-name :initform t)
349 (lowercase-attr-name :initform t)
350 (escape-flag :initform nil)
351 (last-four-chars :initform nil)
352 (state :initform :data-state :accessor tokenizer-state)
353 (escape :initform nil)
354 (current-token :initform nil)
355 (token-queue :initform nil)
356 (temporary-buffer :initform nil)))
358 (defun make-html-tokenizer (source &key encoding cdata-switch-helper)
359 (make-instance 'html-tokenizer
360 :stream (make-html-input-stream source :override-encoding encoding)
361 :cdata-switch-helper cdata-switch-helper))
363 (defun map-tokens (tokenizer function)
364 "Return next token or NIL on eof"
365 (with-slots (token-queue stream) tokenizer
366 (loop while (run-state tokenizer) do
367 (setf token-queue (nreverse token-queue))
368 (loop while (html5-stream-errors stream)
369 do (funcall function (list :type :parse-error :data (pop (html5-stream-errors stream)))))
370 (loop while token-queue
371 do (funcall function (pop token-queue))))))
373 (defun run-state (tokenizer)
374 (run-state* tokenizer (slot-value tokenizer 'state)))
376 (defgeneric run-state* (tokenizer state))
378 (defmacro defstate (state (&rest slots) &body body)
379 `(defmethod run-state* (self (state (eql ,state)))
380 (with-slots (,@slots) self
385 (defun push-token (self token)
386 (with-slots (token-queue) self
387 (push token token-queue)))
389 (defun make-growable-string (&optional (init ""))
390 "Make an adjustable string with a fill pointer.
391 Given INIT, a string, return an adjustable version of it with the fill
394 (make-array (max 5 (length init))
395 :element-type 'character
397 :fill-pointer (length init))))
399 (replace string init))
402 (defun nconcat (string &rest data)
403 "Destructively concatenate DATA, string designators, to STRING."
404 (declare (optimize speed))
405 (unless (array-has-fill-pointer-p string)
406 (setf string (make-growable-string string)))
407 (labels ((conc (string x)
410 (vector-push-extend x string))
412 (let ((len (length x)))
413 (loop for c across x do
414 (vector-push-extend c string len))))
415 (symbol (conc string (string x))))))
416 (dolist (x data string)
419 (define-modify-macro nconcatf (&rest data) nconcat)
421 (defun push-token* (self type &rest data)
422 "Push a token with :type type and :data the a string concatenation of data"
423 (push-token self (list :type type
424 :data (apply #'nconcat (make-growable-string) data))))
426 (defun add-attribute (token name)
427 (setf (getf token :data) (append (getf token :data)
428 (list (cons (make-growable-string (string name))
429 (make-growable-string))))))
431 (defun add-to-attr-name (token &rest data)
432 (setf (caar (last (getf token :data)))
434 (caar (last (getf token :data)))
437 (defun add-to-attr-value (token &rest data)
438 (setf (cdar (last (getf token :data)))
440 (cdar (last (getf token :data)))
443 (defun add-to (token indicator &rest data)
444 (setf (getf token indicator)
446 (getf token indicator)
449 (defun consume-number-entity (self is-hex)
450 "This function returns either U+FFFD or the character based on the
451 decimal or hexadecimal representation. It also discards \";\" if present.
452 If not present a token (:type :parse-error) is emitted.
454 (with-slots (stream) self
455 (let ((allowed +digits+)
462 (setf allowed +hex-digits+)
465 ;; Consume all the characters that are in range while making sure we
467 (setf c (html5-stream-char stream))
468 (loop while (and (find c allowed) (not (eql c +eof+))) do
470 (setf c (html5-stream-char stream)))
472 ;; Convert the set of characters consumed to an int.
473 (setf char-as-int (parse-integer (coerce (nreverse char-stack) 'string) :radix radix))
475 ;; Certain characters get replaced with others
476 (cond ((find char-as-int +replacement-characters+)
477 (setf char (getf +replacement-characters+ char-as-int))
478 (push-token self `(:type :parse-error
479 :data :illegal-codepoint-for-numeric-entity
480 :datavars '(:char-as-int ,char-as-int))))
481 ((or (<= #xD800 char-as-int #xDFFF)
482 (> char-as-int #x10FFFF))
484 (push-token self `(:type :parse-error
485 :data :illegal-codepoint-for-numeric-entity
486 :datavars '(:char-as-int ,char-as-int))))
488 ;; Python comment: Should speed up this check somehow (e.g. move the set to a constant)
489 (when (or (<= #x0001 char-as-int #x0008)
490 (<= #x000E char-as-int #x001F)
491 (<= #x007F char-as-int #x009F)
492 (<= #xFDD0 char-as-int #xFDEF)
494 #(#x000B #xFFFE #xFFFF #x1FFFE
495 #x1FFFF #x2FFFE #x2FFFF #x3FFFE
496 #x3FFFF #x4FFFE #x4FFFF #x5FFFE
497 #x5FFFF #x6FFFE #x6FFFF #x7FFFE
498 #x7FFFF #x8FFFE #x8FFFF #x9FFFE
499 #x9FFFF #xAFFFE #xAFFFF #xBFFFE
500 #xBFFFF #xCFFFE #xCFFFF #xDFFFE
501 #xDFFFF #xEFFFE #xEFFFF #xFFFFE
502 #xFFFFF #x10FFFE #x10FFFF)))
503 (push-token self `(:type :parse-error
504 :data :illegal-codepoint-for-numeric-entity
505 :datavars '(:char-as-int ,char-as-int))))
506 ;; Assume char-code-limit >= 1114112
507 (setf char (code-char char-as-int))))
509 ;; Discard the ; if present. Otherwise, put it back on the queue and
510 ;; invoke parseError on parser.
512 (push-token self `(:type :parse-error :data :numeric-entity-without-semicolon))
513 (html5-stream-unget stream c))
517 (defun consume-entity (self &key allowed-char from-attribute)
518 (with-slots (stream current-token) self
520 (stack (list (html5-stream-char stream))))
521 (cond ((or (find (car stack) +space-characters+)
522 (find (car stack) '(+eof+ #\< #\&))
523 (and allowed-char (eql allowed-char (car stack))))
524 (html5-stream-unget stream (car stack)))
525 ((eql (car stack) #\#)
526 (push (html5-stream-char stream) stack)
527 (let ((is-hex (find (car stack) "xX")))
529 (push (html5-stream-char stream) stack))
530 (cond ((find (car stack) (if is-hex +hex-digits+ +digits+))
531 (html5-stream-unget stream (car stack))
532 (setf output (consume-number-entity self is-hex)))
534 (push-token self '(:type :parse-error :data :expected-numeric-entity))
535 (html5-stream-unget stream (pop stack))
537 (html5-stream-unget stream (pop stack)))
538 (html5-stream-unget stream (pop stack))))))
540 ;; Consume the maximum number of characters possible, with the
541 ;; consumed characters matching one of the identifiers in the first
542 ;; column of the named character references table
543 ;; (in a case-sensitive manner).
546 (loop with node = *entities-tree*
547 for char = (car stack) then (car (push (html5-stream-char stream)
549 for next-node = (assoc char node)
551 do (when (second next-node)
552 (setf entity (second next-node))
553 (setf match-at (length stack)))
554 do (setf node (cddr next-node)))
556 ;; Unconsume those characters that are not part of the match
557 ;; This unconsumes everything if there where no match
558 (loop until (= (length stack) match-at) do
559 (setf next-char (car stack))
560 (html5-stream-unget stream (pop stack)))
562 ;; If no match can be made, then no characters are consumed, and nothing is returned.
563 ;; Is this always a parse error really?
564 (push-token self '(:type :parse-error :data :expected-named-entity)))
566 (not (eql #\; (car stack)))
567 (or (eql next-char #\=)
568 (find next-char +digits+)
569 (ascii-letter-p next-char)))
570 ; Is this a parse error really?
571 (push-token self '(:type :parse-error :data :bogus))
572 (setf output (concatenate 'string "&" (reverse stack))))
574 (unless (eql #\; (car stack))
575 (push-token self '(:type :parse-error
576 :data :named-entity-without-semicolon)))
577 (setf output entity)))))))
579 (cond (from-attribute
580 (add-to-attr-value current-token output))
582 (push-token* self (if (find (char output 0) +space-characters+)
587 (defun process-entity-in-attribute (self &key allowed-char)
588 (consume-entity self :allowed-char allowed-char :from-attribute t))
590 (defun emit-current-token (self)
591 "This method is a generic handler for emitting the tags. It also sets
592 the state to :data because that's what's needed after a token has been
595 (with-slots (current-token state lowercase-element-name) self
596 (let ((token current-token))
597 ;; Add token to the queue to be yielded
598 (when (find (getf token :type) +tag-token-types+)
599 (when lowercase-element-name
600 (setf (getf token :name) (ascii-upper-2-lower (getf token :name))))
601 (when (eql (getf token :type) :end-tag)
602 (when (getf token :data)
603 (push-token self '(:type :parse-error :data :attributes-in-end-tag)))
604 (when (getf token :self-closing)
605 (push-token self '(:type :parse-error :data :self-closing-flag-on-end-tag)))))
606 (push-token self token)
607 (setf state :data-state))))
610 ;;; Below are the various tokenizer states worked out.
613 (defstate :data-state (stream state)
614 (let ((data (html5-stream-char stream)))
615 (cond ((eql data #\&)
616 (setf state :entity-data-state))
618 (setf state :tag-open-state))
620 (push-token self '(:type :parse-error :data :invalid-codepoint))
621 (push-token* self :characters #\u0000))
623 ;; Tokenization ends.
625 ((find data +space-characters+)
626 ;; Directly after emitting a token you switch back to the "data
627 ;; state". At that point spaceCharacters are important so they are
628 ;; emitted separately.
629 (push-token* self :space-characters
631 (html5-stream-chars-until stream +space-characters+ t))
632 ;; No need to update lastFourChars here, since the first space will
633 ;; have already been appended to lastFourChars and will have broken
634 ;; any <!-- or --> sequences
637 (push-token* self :characters
639 (html5-stream-chars-until stream '(#\& #\< #\u0000)))))))
641 (defstate :entity-data-state (state)
642 (consume-entity self)
643 (setf state :data-state))
645 (defstate :rcdata-state (stream state)
646 (let ((data (html5-stream-char stream)))
647 (cond ((eql data #\&)
648 (setf state :character-reference-in-rcdata))
650 (setf state :rcdata-less-than-sign-state))
652 ;; Tokenization ends.
655 (push-token self '(:type :parse-error :data :invalid-codepoint))
656 (push-token* self :characters #\uFFFD))
657 ((find data +space-characters+)
658 ;; Directly after emitting a token you switch back to the "data
659 ;; state". At that point spaceCharacters are important so they are
660 ;; emitted separately.
661 (push-token* self :space-characters
663 (html5-stream-chars-until stream +space-characters+ t))
664 ;; No need to update lastFourChars here, since the first space will
665 ;; have already been appended to lastFourChars and will have broken
666 ;; any <!-- or --> sequences
669 (push-token* self :characters
671 (html5-stream-chars-until stream '(#\& #\<)))))))
673 (defstate :character-reference-in-rcdata (state)
674 (consume-entity self)
675 (setf state :rcdata-state))
677 (defstate :rawtext-state (stream state)
678 (let ((data (html5-stream-char stream)))
679 (cond ((eql data #\<)
680 (setf state :rawtext-less-than-sign-state))
682 (push-token self '(:type :parse-error :data :invalid-codepoint))
683 (push-token* self :characters #\uFFFD))
685 ;; Tokenization ends.
688 (push-token* self :characters
690 (html5-stream-chars-until stream '(#\< #\u0000)))))))
692 (defstate :script-data-state (stream state)
693 (let ((data (html5-stream-char stream)))
694 (cond ((eql data #\<)
695 (setf state :script-data-less-than-sign-state))
697 (push-token self '(:type :parse-error :data :invalid-codepoint))
698 (push-token* self :characters #\uFFFD))
700 ;; Tokenization ends.
703 (push-token* self :characters
705 (html5-stream-chars-until stream '(#\< #\u0000)))))))
707 (defstate :plaintext-state (stream)
708 (let ((data (html5-stream-char stream)))
709 (cond ((eql data +eof+)
710 ;; Tokenization ends.
713 (push-token self '(:type :parse-error :data :invalid-codepoint))
714 (push-token* self :characters #\uFFFD))
716 (push-token* self :characters
718 (html5-stream-chars-until stream '(#\u0000)))))))
720 (defstate :tag-open-state (stream state current-token)
721 (let ((data (html5-stream-char stream)))
722 (cond ((eql data #\!)
723 (setf state :markup-declaration-open-state))
725 (setf state :close-tag-open-state))
726 ((ascii-letter-p data)
727 (setf current-token (list :type :start-tag
728 :name (make-array 1 :element-type 'character
729 :initial-element data
734 :self-closing-acknowledged nil))
735 (setf state :tag-name-state))
737 ;; XXX In theory it could be something besides a tag name. But
738 ;; do we really care?
739 (push-token self '(:type :parse-error :data :expected-tag-name-but-got-right-bracket))
740 (push-token* self :characters "<>")
741 (setf state :data-state))
743 ;; XXX In theory it could be something besides a tag name. But
744 ;; do we really care?
745 (push-token self '(:type :parse-error :data :expected-tag-name-but-got-question-mark))
746 (html5-stream-unget stream data)
747 (setf state :bogus-comment-state))
750 (push-token self '(:type :parse-error :data :expected-tag-name))
751 (push-token* self :characters "<")
752 (html5-stream-unget stream data)
753 (setf state :data-state)))))
755 (defstate :close-tag-open-state
756 (stream state current-token)
757 (let ((data (html5-stream-char stream)))
758 (cond ((ascii-letter-p data)
759 (setf current-token (list :type :end-tag
760 :name (make-array 1 :element-type 'character
761 :initial-element data
766 (setf state :tag-name-state))
768 (push-token self '(:type :parse-error :data :expected-closing-tag-but-got-right-bracket))
769 (setf state :data-state))
771 (push-token self '(:type :parse-error :data :expected-closing-tag-but-got-eof))
772 (push-token* self :characters "</")
773 (setf state :data-state))
775 ;; XXX data can be _'_...
776 (push-token self `(:type :parse-error :data :expected-closing-tag-but-got-char
777 :datavars (:data ,data)))
778 (html5-stream-unget stream data)
779 (setf state :bogus-comment-state))))
782 (defstate :tag-name-state (stream state current-token)
783 (let ((data (html5-stream-char stream)))
784 (cond ((find data +space-characters+)
785 (setf state :before-attribute-name-state))
787 (emit-current-token self))
789 (push-token self '(:type :parse-error :data :eof-in-tag-name))
790 (setf state :data-state))
792 (setf state :self-closing-start-tag-state))
794 (push-token self '(:type :parse-error :data :invalid-codepoint))
795 (vector-push-extend #\uFFFD (getf current-token :name)))
797 (vector-push-extend data (getf current-token :name))
798 ;; (Don't use charsUntil here, because tag names are
799 ;; very short and it's faster to not do anything fancy)
802 (defstate :rcdata-less-than-sign-state (stream state temporary-buffer)
803 (let ((data (html5-stream-char stream)))
804 (cond ((eql data #\/)
805 (setf temporary-buffer (make-growable-string))
806 (setf state :rcdata-end-tag-open-state))
808 (push-token* self :characters "<")
809 (html5-stream-unget stream data)
810 (setf state :rcdata-state)))))
812 (defstate :rcdata-end-tag-open-state (stream state temporary-buffer)
813 (let ((data (html5-stream-char stream)))
814 (cond ((ascii-letter-p data)
815 (nconcatf temporary-buffer (string data))
816 (setf state :rcdata-end-tag-name-state))
818 (push-token* self :characters "</")
819 (html5-stream-unget stream data)
820 (setf state :rcdata-state)))))
822 (defstate :rcdata-end-tag-name-state (stream state temporary-buffer current-token)
823 (let ((appropriate (and current-token
824 (string-equal (getf current-token :name)
826 (data (html5-stream-char stream)))
827 (cond ((and (find data +space-characters+)
829 (setf current-token (list :type :end-tag
830 :name temporary-buffer
833 (setf state :before-attribute-name-state))
836 (setf current-token (list :type :end-tag
837 :name temporary-buffer
840 (setf state :self-closing-start-tag-state))
843 (setf current-token (list :type :end-tag
844 :name temporary-buffer
847 (emit-current-token self)
848 (setf state :data-state))
849 ((ascii-letter-p data)
850 (nconcatf temporary-buffer data))
852 (push-token* self :characters "</" temporary-buffer)
853 (html5-stream-unget stream data)
854 (setf state :rcdata-state)))))
856 (defstate :rawtext-less-than-sign-state (stream state temporary-buffer)
857 (let ((data (html5-stream-char stream)))
858 (cond ((eql data #\/)
859 (setf temporary-buffer (make-growable-string))
860 (setf state :rawtext-end-tag-open-state))
862 (push-token* self :characters "<")
863 (html5-stream-unget stream data)
864 (setf state :rawtext-state)))))
866 (defstate :rawtext-end-tag-open-state (stream state temporary-buffer)
867 (let ((data (html5-stream-char stream)))
868 (cond ((ascii-letter-p data)
869 (nconcatf temporary-buffer (string data))
870 (setf state :rawtext-end-tag-name-state))
872 (push-token* self :characters "</")
873 (html5-stream-unget stream data)
874 (setf state :rawtext-state)))))
876 (defstate :rawtext-end-tag-name-state (stream state temporary-buffer current-token)
877 (let ((appropriate (and current-token
878 (string-equal (getf current-token :name)
880 (data (html5-stream-char stream)))
881 (cond ((and (find data +space-characters+)
883 (setf current-token (list :type :end-tag
884 :name temporary-buffer
887 (setf state :before-attribute-name-state))
890 (setf current-token (list :type :end-tag
891 :name temporary-buffer
894 (setf state :self-closing-start-tag-state))
897 (setf current-token (list :type :end-tag
898 :name temporary-buffer
901 (emit-current-token self)
902 (setf state :data-state))
903 ((ascii-letter-p data)
904 (nconcatf temporary-buffer data))
906 (push-token* self :characters "</" temporary-buffer)
907 (html5-stream-unget stream data)
908 (setf state :rawtext-state)))))
910 (defstate :script-data-less-than-sign-state (stream state temporary-buffer)
911 (let ((data (html5-stream-char stream)))
912 (cond ((eql data #\/)
913 (setf temporary-buffer (make-growable-string))
914 (setf state :script-data-end-tag-open-state))
916 (push-token* self :characters "<!")
917 (setf state :script-data-escape-start-state))
919 (push-token* self :characters "<")
920 (html5-stream-unget stream data)
921 (setf state :script-data-state)))))
923 (defstate :script-data-end-tag-open-state (stream state temporary-buffer)
924 (let ((data (html5-stream-char stream)))
925 (cond ((ascii-letter-p data)
926 (nconcatf temporary-buffer data)
927 (setf state :script-data-end-tag-name-state))
929 (push-token* self :characters "</")
930 (html5-stream-unget stream data)
931 (setf state :script-data-state)))))
933 (defstate :script-data-end-tag-name-state (stream state temporary-buffer current-token)
934 (let ((appropriate (and current-token
935 (string-equal (getf current-token :name)
937 (data (html5-stream-char stream)))
938 (cond ((and (find data +space-characters+)
940 (setf current-token (list :type :end-tag
941 :name temporary-buffer
944 (setf state :before-attribute-name-state))
947 (setf current-token (list :type :end-tag
948 :name temporary-buffer
951 (setf state :self-closing-start-tag-state))
954 (setf current-token (list :type :end-tag
955 :name temporary-buffer
958 (emit-current-token self)
959 (setf state :data-state))
960 ((ascii-letter-p data)
961 (nconcatf temporary-buffer data))
963 (push-token* self :characters "</" temporary-buffer)
964 (html5-stream-unget stream data)
965 (setf state :script-data-state)))))
967 (defstate :script-data-escape-start-state (stream state)
968 (let ((data (html5-stream-char stream)))
969 (cond ((eql data #\-)
970 (push-token* self :characters "-")
971 (setf state :script-data-escape-start-dash-state))
973 (html5-stream-unget stream data)
974 (setf state :script-data-state)))))
976 (defstate :script-data-escape-start-dash-state (stream state)
977 (let ((data (html5-stream-char stream)))
978 (cond ((eql data #\-)
979 (push-token* self :characters "-")
980 (setf state :script-data-escaped-dash-dash-state))
982 (html5-stream-unget stream data)
983 (setf state :script-data-state)))))
985 (defstate :script-data-escaped-state (stream state)
986 (let ((data (html5-stream-char stream)))
987 (cond ((eql data #\-)
988 (push-token* self :characters "-")
989 (setf state :script-data-escaped-dash-state))
991 (setf state :script-data-escaped-less-than-sign-state))
993 (push-token self '(:type :parse-error :data :invalid-codepoint))
994 (push-token* self :characters #\uFFFD))
996 (setf state :data-state))
998 (push-token* self :characters data (html5-stream-chars-until stream '(#\< #\- #\u0000)))))))
1000 (defstate :script-data-escaped-dash-state (stream state)
1001 (let ((data (html5-stream-char stream)))
1002 (cond ((eql data #\-)
1003 (push-token* self :characters "-")
1004 (setf state :script-data-escaped-dash-dash-state))
1006 (setf state :script-data-escaped-less-than-sign-state))
1008 (push-token self '(:type :parse-error :data :invalid-codepoint))
1009 (push-token* self :characters #\uFFFD)
1010 (setf state :script-data-escaped-state))
1012 (setf state :data-state))
1014 (push-token* self :characters data (html5-stream-chars-until stream '(#\< #\- #\u0000)))
1015 (setf state :script-data-escaped-state)))))
1017 (defstate :script-data-escaped-dash-dash-state (stream state)
1018 (let ((data (html5-stream-char stream)))
1019 (cond ((eql data #\-)
1020 (push-token* self :characters "-"))
1022 (setf state :script-data-escaped-less-than-sign-state))
1024 (push-token* self :characters ">")
1025 (setf state :script-data-state))
1027 (push-token self '(:type :parse-error :data :invalid-codepoint))
1028 (push-token* self :characters #\uFFFD)
1029 (setf state :script-data-escaped-state))
1031 (setf state :data-state))
1033 (push-token* self :characters data (html5-stream-chars-until stream '(#\< #\- #\u0000)))
1034 (setf state :script-data-escaped-state)))))
1036 (defstate :script-data-escaped-less-than-sign-state (stream state temporary-buffer)
1037 (let ((data (html5-stream-char stream)))
1038 (cond ((eql data #\/)
1039 (setf temporary-buffer (make-growable-string))
1040 (setf state :script-data-escaped-end-tag-open-state))
1041 ((ascii-letter-p data)
1042 (push-token* self :characters "<" data)
1043 (setf temporary-buffer (ascii-upper-2-lower (string data)))
1044 (setf state :script-data-double-escape-start-state))
1046 (push-token* self :characters "<")
1047 (html5-stream-unget stream data)
1048 (setf state :script-data-escaped-state)))))
1050 (defstate :script-data-escaped-end-tag-open-state (stream state temporary-buffer)
1051 (let ((data (html5-stream-char stream)))
1052 (cond ((ascii-letter-p data)
1053 (setf temporary-buffer (string data))
1054 (setf state :script-data-escaped-end-tag-name-state))
1056 (push-token* self :characters "</")
1057 (html5-stream-unget stream data)
1058 (setf state :script-data-escaped-state)))))
1060 (defstate :script-data-escaped-end-tag-name-state (stream state temporary-buffer current-token)
1061 (let ((appropriate (and current-token
1062 (string-equal (getf current-token :name)
1064 (data (html5-stream-char stream)))
1065 (cond ((and (find data +space-characters+)
1067 (setf current-token (list :type :end-tag
1068 :name temporary-buffer
1071 (setf state :before-attribute-name-state))
1072 ((and (eql data #\/)
1074 (setf current-token (list :type :end-tag
1075 :name temporary-buffer
1078 (setf state :self-closing-start-tag-state))
1079 ((and (eql data #\>)
1081 (setf current-token (list :type :end-tag
1082 :name temporary-buffer
1085 (emit-current-token self)
1086 (setf state :data-state))
1087 ((ascii-letter-p data)
1088 (nconcatf temporary-buffer data))
1090 (push-token* self :characters "</" temporary-buffer)
1091 (html5-stream-unget stream data)
1092 (setf state :script-data-escaped-state)))))
1094 (defstate :script-data-double-escape-start-state (stream state temporary-buffer)
1095 (let ((data (html5-stream-char stream)))
1096 (cond ((or (find data +space-characters+)
1097 (find data '(#\/ #\>)))
1098 (push-token* self :characters data)
1099 (if (string= (string-downcase temporary-buffer) "script")
1100 (setf state :script-data-double-escaped-state)
1101 (setf state :script-data-escaped-state)))
1102 ((ascii-letter-p data)
1103 (push-token* self :characters data)
1104 (nconcatf temporary-buffer (string data)))
1106 (html5-stream-unget stream data)
1107 (setf state :script-data-escaped-state)))))
1109 (defstate :script-data-double-escaped-state (stream state)
1110 (let ((data (html5-stream-char stream)))
1111 (cond ((eql data #\-)
1112 (push-token* self :characters "-")
1113 (setf state :script-data-double-escaped-dash-state))
1115 (push-token* self :characters "<")
1116 (setf state :script-data-double-escaped-less-than-sign-state))
1118 (push-token self '(:type :parse-error :data :invalid-codepoint))
1119 (push-token* self :characters #\uFFFD))
1121 (push-token self '(:type :parse-error :data :eof-in-script-in-script))
1122 (setf state :data-state))
1124 (push-token* self :characters data)))))
1126 (defstate :script-data-double-escaped-dash-state (stream state)
1127 (let ((data (html5-stream-char stream)))
1128 (cond ((eql data #\-)
1129 (push-token* self :characters "-")
1130 (setf state :script-data-double-escaped-dash-dash-state))
1132 (push-token* self :characters "<")
1133 (setf state :script-data-double-escaped-less-than-sign-state))
1135 (push-token self '(:type :parse-error :data :invalid-codepoint))
1136 (push-token* self :characters #\uFFFD)
1137 (setf state :script-data-double-escaped-state))
1139 (push-token self '(:type :parse-error :data :eof-in-script-in-script))
1140 (setf state :data-state))
1142 (push-token* self :characters data)
1143 (setf state :script-data-double-escaped-state)))))
1145 ;; FIXME: Incorrectly named in Python code: scriptDataDoubleEscapedDashState (same the one above)
1146 (defstate :script-data-double-escaped-dash-dash-state (stream state)
1147 (let ((data (html5-stream-char stream)))
1148 (cond ((eql data #\-)
1149 (push-token* self :characters "-")
1150 (setf state :script-data-double-escaped-dash-dash-state))
1152 (push-token* self :characters "<")
1153 (setf state :script-data-double-escaped-less-than-sign-state))
1155 (push-token* self :characters ">")
1156 (setf state :script-data-state))
1158 (push-token self '(:type :parse-error :data :invalid-codepoint))
1159 (push-token* self :characters #\uFFFD)
1160 (setf state :script-data-double-escaped-state))
1162 (push-token self '(:type :parse-error :data :eof-in-script-in-script))
1163 (setf state :data-state))
1165 (push-token* self :characters data)
1166 (setf state :script-data-double-escaped-state)))))
1168 (defstate :script-data-double-escaped-less-than-sign-state (stream state temporary-buffer)
1169 (let ((data (html5-stream-char stream)))
1170 (cond ((eql data #\/)
1171 (push-token* self :characters "/")
1172 (setf temporary-buffer (make-growable-string))
1173 (setf state :script-data-double-escape-end-state))
1175 (html5-stream-unget stream data)
1176 (setf state :script-data-double-escaped-state)))))
1178 (defstate :script-data-double-escape-end-state (stream state temporary-buffer)
1179 (let ((data (html5-stream-char stream)))
1180 (cond ((or (find data +space-characters+)
1181 (find data '(#\/ #\>)))
1182 (push-token* self :characters data)
1183 (if (string= (string-downcase temporary-buffer) "script")
1184 (setf state :script-data-escaped-state)
1185 (setf state :script-data-double-escaped-state)))
1186 ((ascii-letter-p data)
1187 (push-token* self :characters data)
1188 (nconcatf temporary-buffer data))
1190 (html5-stream-unget stream data)
1191 (setf state :script-data-double-escaped-state)))))
1193 (defstate :before-attribute-name-state (stream state current-token)
1194 (let ((data (html5-stream-char stream)))
1195 (cond ((find data +space-characters+)
1196 (html5-stream-chars-until stream +space-characters+ t))
1197 ((ascii-letter-p data)
1198 (add-attribute current-token data)
1199 (setf state :attribute-name-state))
1201 (emit-current-token self))
1203 (setf state :self-closing-start-tag-state))
1204 ((find data '(#\' #\" #\= #\<))
1205 (push-token self '(:type :parse-error :data :invalid-character-in-attribute-name))
1206 (add-attribute current-token data)
1207 (setf state :attribute-name-state))
1209 (push-token self '(:type :parse-error :data :invalid-codepoint))
1210 (add-attribute current-token #\uFFFD)
1211 (setf state :attribute-name-state))
1213 (push-token self '(:type :parse-error :data :expected-attribute-name-but-got-eof))
1214 (setf state :data-state))
1216 (add-attribute current-token data)
1217 (setf state :attribute-name-state)))))
1219 (defstate :attribute-name-state (stream state current-token lowercase-attr-name)
1220 (let ((data (html5-stream-char stream))
1221 (leaving-this-state t)
1223 (cond ((eql data #\=)
1224 (setf state :before-attribute-value-state))
1225 ((ascii-letter-p data)
1226 (add-to-attr-name current-token data
1227 (html5-stream-chars-until stream +ascii-letters+ t))
1228 (setf leaving-this-state nil))
1230 ;; XXX If we emit here the attributes are converted to a dict
1231 ;; without being checked and when the code below runs we error
1232 ;; because data is a dict not a list
1233 (setf emit-token t))
1234 ((find data +space-characters+)
1235 (setf state :after-attribute-name-state))
1237 (setf state :self-closing-start-tag-state))
1239 (push-token self '(:type :parse-error :data :invalid-codepoint))
1240 (add-to-attr-name current-token #\uFFFD)
1241 (setf leaving-this-state nil))
1242 ((find data '(#\' #\" #\<))
1243 (push-token self '(:type :parse-error :data :invalid-character-in-attribute-name))
1244 (add-to-attr-name current-token data)
1245 (setf leaving-this-state nil))
1247 (push-token self '(:type :parse-error :data :eof-in-attribute-name))
1248 (setf state :data-state))
1250 (add-to-attr-name current-token data)
1251 (setf leaving-this-state nil)))
1252 (when leaving-this-state
1253 ;; Attributes are not dropped at this stage. That happens when the
1254 ;; start tag token is emitted so values can still be safely appended
1255 ;; to attributes, but we do want to report the parse error in time.
1256 (when lowercase-attr-name
1257 (setf (caar (last (getf current-token :data)))
1258 (ascii-upper-2-lower (caar (last (getf current-token :data))))))
1259 (loop for (name . value) in (butlast (getf current-token :data)) do
1260 (when (string= (caar (last (getf current-token :data))) name)
1261 (push-token self '(:type :parse-error :data :duplicate-attribute))
1263 ;; XXX Fix for above XXX
1265 (emit-current-token self)))))
1267 (defstate :after-attribute-name-state (stream state current-token)
1268 (let ((data (html5-stream-char stream)))
1269 (cond ((find data +space-characters+)
1270 (html5-stream-chars-until stream +space-characters+ t))
1272 (setf state :before-attribute-value-state))
1274 (emit-current-token self))
1275 ((ascii-letter-p data)
1276 (add-attribute current-token data)
1277 (setf state :attribute-name-state))
1279 (setf state :self-closing-start-tag-state))
1281 (push-token self '(:type :parse-error :data :invalid-codepoint))
1282 (add-attribute current-token #\uFFFD)
1283 (setf state :attribute-name-state))
1284 ((find data '(#\' #\" #\<))
1285 (push-token self '(:type :parse-error :data :invalid-character-after-attribute-name))
1286 (add-attribute current-token data)
1287 (setf state :attribute-name-state))
1289 (push-token self '(:type :parse-error :data :expected-end-of-tag-but-got-eof))
1290 (setf state :data-state))
1292 (add-attribute current-token data)
1293 (setf state :attribute-name-state)))))
1295 (defstate :before-attribute-value-state (stream state current-token)
1296 (let ((data (html5-stream-char stream)))
1297 (cond ((find data +space-characters+)
1298 (html5-stream-chars-until stream +space-characters+ t))
1300 (setf state :attribute-value-double-quoted-state))
1302 (setf state :attribute-value-un-quoted-state)
1303 (html5-stream-unget stream data))
1305 (setf state :attribute-value-single-quoted-state))
1307 (push-token self '(:type :parse-error :data :expected-attribute-value-but-got-right-bracket))
1308 (emit-current-token self))
1310 (push-token self '(:type :parse-error :data :invalid-codepoint))
1311 (add-to-attr-value current-token #\uFFFD)
1312 (setf state :attribute-value-un-quoted-state))
1313 ((find data '(#\= #\< #\`))
1314 (push-token self '(:type :parse-error :data :equals-in-unquoted-attribute-value))
1315 (add-to-attr-value current-token data)
1316 (setf state :attribute-value-un-quoted-state))
1318 (push-token self '(:type :parse-error :data :expected-attribute-value-but-got-eof))
1319 (setf state :data-state))
1321 (add-to-attr-value current-token data)
1322 (setf state :attribute-value-un-quoted-state)))))
1324 (defstate :attribute-value-double-quoted-state (stream state current-token)
1325 (let ((data (html5-stream-char stream)))
1326 (cond ((eql data #\")
1327 (setf state :after-attribute-value-state))
1329 (process-entity-in-attribute self :allowed-char #\"))
1331 (push-token self '(:type :parse-error :data :invalid-codepoint))
1332 (add-to-attr-value current-token #\uFFFD))
1334 (push-token self '(:type :parse-error :data :eof-in-attribute-value-double-quote))
1335 (setf state :data-state))
1337 (add-to-attr-value current-token
1339 (html5-stream-chars-until stream '(#\" #\&)))))))
1341 (defstate :attribute-value-single-quoted-state (stream state current-token)
1342 (let ((data (html5-stream-char stream)))
1343 (cond ((eql data #\')
1344 (setf state :after-attribute-value-state))
1346 (process-entity-in-attribute self :allowed-char #\'))
1348 (push-token self '(:type :parse-error :data :invalid-codepoint))
1349 (add-to-attr-value current-token #\uFFFD))
1351 (push-token self '(:type :parse-error :data :eof-in-attribute-value-single-quote))
1352 (setf state :data-state))
1354 (add-to-attr-value current-token
1356 (html5-stream-chars-until stream '(#\' #\&)))))))
1358 (defstate :attribute-value-un-quoted-state (stream state current-token)
1359 (let ((data (html5-stream-char stream)))
1360 (cond ((find data +space-characters+)
1361 (setf state :before-attribute-name-state))
1363 (process-entity-in-attribute self :allowed-char #\>))
1365 (emit-current-token self))
1366 ((find data '(#\" #\' #\= #\< #\`))
1367 (push-token self '(:type :parse-error :data :unexpected-character-in-unquoted-attribute-value))
1368 (add-to-attr-value current-token data))
1370 (push-token self '(:type :parse-error :data :invalid-codepoint))
1371 (add-to-attr-value current-token #\uFFFD))
1373 (push-token self '(:type :parse-error :data :eof-in-attribute-value-no-quotes))
1374 (setf state :data-state))
1376 (add-to-attr-value current-token
1378 (html5-stream-chars-until stream `(#\& #\> #\" #\' #\= #\< #\`
1379 ,@+space-characters+)))))))
1381 (defstate :after-attribute-value-state (stream state current-token)
1382 (let ((data (html5-stream-char stream)))
1383 (cond ((find data +space-characters+)
1384 (setf state :before-attribute-name-state))
1386 (emit-current-token self))
1388 (setf state :self-closing-start-tag-state))
1390 (push-token self '(:type :parse-error :data :unexpected-EOF-after-attribute-value))
1391 (html5-stream-unget stream data)
1392 (setf state :data-state))
1394 (push-token self '(:type :parse-error :data :unexpected-character-after-attribute-value))
1395 (html5-stream-unget stream data)
1396 (setf state :before-attribute-name-state)))))
1398 (defstate :self-closing-start-tag-state (stream state current-token)
1399 (let ((data (html5-stream-char stream)))
1400 (cond ((eql data #\>)
1401 (setf (getf current-token :self-closing) t)
1402 (emit-current-token self))
1404 (push-token self '(:type :parse-error :data :unexpected-EOF-after-solidus-in-tag))
1405 (html5-stream-unget stream data)
1406 (setf state :data-state))
1408 (push-token self '(:type :parse-error :data :unexpected-character-after-soldius-in-tag))
1409 (html5-stream-unget stream data)
1410 (setf state :before-attribute-name-state)))))
1412 (defstate :bogus-comment-state (stream state current-token)
1413 ;; Make a new comment token and give it as value all the characters
1414 ;; until the first > or EOF (charsUntil checks for EOF automatically)
1416 (let ((data (html5-stream-chars-until stream '(#\>))))
1417 (setf data (substitute #\uFFFD #\u0000 data))
1418 (push-token* self :comment data)
1419 ;; Eat the character directly after the bogus comment which is either a
1421 (html5-stream-char stream)
1422 (setf state :data-state)))
1424 (defstate :markup-declaration-open-state (stream state current-token
1425 cdata-switch-helper)
1426 (let ((char-stack (make-array 1
1427 :initial-element (html5-stream-char stream)
1430 (cond ((eql (aref char-stack (1- (length char-stack))) #\-)
1431 (vector-push-extend (html5-stream-char stream) char-stack)
1432 (when (eql (aref char-stack (1- (length char-stack))) #\-)
1433 (setf current-token (list :type :comment :data ""))
1434 (setf state :comment-start-state)
1436 ((find (aref char-stack (1- (length char-stack))) '(#\d #\D))
1438 (loop for expected in '((#\o #\O) (#\c #\C) (#\t #\T) (#\y #\Y) (#\p #\P) (#\e #\E)) do
1439 (vector-push-extend (html5-stream-char stream) char-stack)
1440 (unless (find (aref char-stack (1- (length char-stack))) expected)
1444 (setf current-token (list :type :doctype
1449 (setf state :doctype-state)
1451 ((and (eql (aref char-stack (1- (length char-stack))) #\[)
1452 (funcall cdata-switch-helper))
1454 (loop for expected across "CDATA[" do
1455 (vector-push-extend (html5-stream-char stream) char-stack)
1456 (unless (eql (aref char-stack (1- (length char-stack))) expected)
1460 (setf state :cdata-section-state)
1462 (push-token self '(:type :parse-error :data :expected-dashes-or-doctype))
1463 (loop while (plusp (length char-stack)) do
1464 (html5-stream-unget stream (vector-pop char-stack)))
1465 (setf state :bogus-comment-state)))
1467 (defstate :comment-start-state (stream state current-token)
1468 (let ((data (html5-stream-char stream)))
1469 (cond ((eql data #\-)
1470 (setf state :comment-start-dash-state))
1472 (push-token self '(:type :parse-error :data :invalid-codepoint))
1473 (add-to current-token :data #\uFFFD))
1475 (push-token self '(:type :parse-error :data :incorrect-comment))
1476 (push-token self current-token)
1477 (setf state :data-state))
1479 (push-token self '(:type :parse-error :data :eof-in-comment))
1480 (push-token self current-token)
1481 (setf state :data-state))
1483 (add-to current-token :data data)
1484 (setf state :comment-state)))))
1486 (defstate :comment-start-dash-state (stream state current-token)
1487 (let ((data (html5-stream-char stream)))
1488 (cond ((eql data #\-)
1489 (setf state :comment-end-state))
1491 (push-token self '(:type :parse-error :data :invalid-codepoint))
1492 (add-to current-token :data "-" #\uFFFD))
1494 (push-token self '(:type :parse-error :data :incorrect-comment))
1495 (push-token self current-token)
1496 (setf state :data-state))
1498 (push-token self '(:type :parse-error :data :eof-in-comment))
1499 (push-token self current-token)
1500 (setf state :data-state))
1502 (add-to current-token :data "-" data)
1503 (setf state :comment-state)))))
1505 (defstate :comment-state (stream state current-token)
1506 (let ((data (html5-stream-char stream)))
1507 (cond ((eql data #\-)
1508 (setf state :comment-end-dash-state))
1510 (push-token self '(:type :parse-error :data :invalid-codepoint))
1511 (add-to current-token :data #\uFFFD))
1513 (push-token self '(:type :parse-error :data :eof-in-comment))
1514 (push-token self current-token)
1515 (setf state :data-state))
1517 (add-to current-token :data data
1518 (html5-stream-chars-until stream '(#\- #\u0000)))))))
1520 (defstate :comment-end-dash-state (stream state current-token)
1521 (let ((data (html5-stream-char stream)))
1522 (cond ((eql data #\-)
1523 (setf state :comment-end-state))
1525 (push-token self '(:type :parse-error :data :invalid-codepoint))
1526 (add-to current-token :data "-" #\uFFFD))
1528 (push-token self '(:type :parse-error :data :eof-in-comment-end-dash))
1529 (push-token self current-token)
1530 (setf state :data-state))
1532 (add-to current-token :data "-" data)
1533 (setf state :comment-state)))))
1535 (defstate :comment-end-state (stream state current-token)
1536 (let ((data (html5-stream-char stream)))
1537 (cond ((eql data #\>)
1538 (push-token self current-token)
1539 (setf state :data-state))
1541 (push-token self '(:type :parse-error :data :invalid-codepoint))
1542 (add-to current-token :data "--" #\uFFFD)
1543 (setf state :comment-state))
1545 (push-token self '(:type :parse-error :data :unexpected-bang-after-double-dash-in-comment))
1546 (setf state :comment-end-bang-state))
1548 (push-token self '(:type :parse-error :data :unexpected-dash-after-double-dash-in-comment))
1549 (add-to current-token :data data))
1551 (push-token self '(:type :parse-error :data :eof-in-comment-double-dash))
1552 (push-token self current-token)
1553 (setf state :data-state))
1556 (push-token self '(:type :parse-error :data :unexpected-char-in-comment))
1557 (add-to current-token :data "--" data)
1558 (setf state :comment-state)))))
1560 (defstate :comment-end-bang-state (stream state current-token)
1561 (let ((data (html5-stream-char stream)))
1562 (cond ((eql data #\>)
1563 (push-token self current-token)
1564 (setf state :data-state))
1566 (add-to current-token :data "--!")
1567 (setf state :comment-end-dash-state))
1569 (push-token self '(:type :parse-error :data :invalid-codepoint))
1570 (add-to current-token :data "--!" #\uFFFD)
1571 (setf state :comment-state))
1573 (push-token self '(:type :parse-error :data :eof-in-comment-end-bang-state))
1574 (push-token self current-token)
1575 (setf state :data-state))
1577 (add-to current-token :data "--!" data)
1578 (setf state :comment-state)))))
1580 (defstate :doctype-state (stream state current-token)
1581 (let ((data (html5-stream-char stream)))
1582 (cond ((find data +space-characters+)
1583 (setf state :before-doctype-name-state))
1585 (push-token self '(:type :parse-error :data :expected-doctype-name-but-got-eof))
1586 (setf (getf current-token :correct) nil)
1587 (push-token self current-token)
1588 (setf state :data-state))
1590 (push-token self '(:type :parse-error :data :need-space-after-doctype))
1591 (html5-stream-unget stream data)
1592 (setf state :before-doctype-name-state)))))
1594 (defstate :before-doctype-name-state (stream state current-token)
1595 (let ((data (html5-stream-char stream)))
1596 (cond ((find data +space-characters+)
1600 (push-token self '(:type :parse-error :data :expected-doctype-name-but-got-right-bracket))
1601 (setf (getf current-token :correct) nil)
1602 (push-token self current-token)
1603 (setf state :data-state))
1605 (push-token self '(:type :parse-error :data :invalid-codepoint))
1606 (add-to current-token :name #\uFFFD)
1607 (setf state :doctype-name-state))
1609 (push-token self '(:type :parse-error :data :expected-doctype-name-but-got-eof))
1610 (setf (getf current-token :correct) nil)
1611 (push-token self current-token)
1612 (setf state :data-state))
1614 (setf (getf current-token :name) (string data))
1615 (setf state :doctype-name-state)))))
1617 (defstate :doctype-name-state (stream state current-token)
1618 (let ((data (html5-stream-char stream)))
1619 (cond ((find data +space-characters+)
1620 (setf (getf current-token :name) (ascii-upper-2-lower (getf current-token :name)))
1621 (setf state :after-doctype-name-state))
1623 (setf (getf current-token :name) (ascii-upper-2-lower (getf current-token :name)))
1624 (push-token self current-token)
1625 (setf state :data-state))
1627 (push-token self '(:type :parse-error :data :invalid-codepoint))
1628 (add-to current-token :name #\uFFFD)
1629 (setf state :doctype-name-state))
1631 (push-token self '(:type :parse-error :data :eof-in-doctype-name))
1632 (setf (getf current-token :correct) nil)
1633 (setf (getf current-token :name) (ascii-upper-2-lower (getf current-token :name)))
1634 (push-token self current-token)
1635 (setf state :data-state))
1637 (add-to current-token :name data)))))
1639 (defstate :after-doctype-name-state (stream state current-token)
1640 (let ((data (html5-stream-char stream)))
1641 (cond ((find data +space-characters+)
1645 (push-token self current-token)
1646 (setf state :data-state))
1648 (setf (getf current-token :correct) nil)
1649 (html5-stream-unget stream data)
1650 (push-token self '(:type :parse-error :data :eof-in-doctype))
1651 (push-token self current-token)
1652 (setf state :data-state))
1654 (cond ((find data '(#\p #\P))
1656 (loop for expected in '((#\u #\U) (#\b #\B) (#\l #\L) (#\i #\I) (#\c #\C)) do
1657 (setf data (html5-stream-char stream))
1658 (unless (find data expected)
1662 (setf state :after-doctype-public-keyword-state)
1664 ((find data '(#\s #\S))
1666 (loop for expected in '((#\y #\Y) (#\s #\S) (#\t #\T) (#\e #\E) (#\m #\M)) do
1667 (setf data (html5-stream-char stream))
1668 (unless (find data expected)
1672 (setf state :after-doctype-system-keyword-state)
1674 ;; All the characters read before the current 'data' will be
1675 ;; [a-zA-Z], so they're garbage in the bogus doctype and can be
1676 ;; discarded; only the latest character might be '>' or EOF
1677 ;; and needs to be ungetted
1678 (html5-stream-unget stream data)
1679 (push-token self `(:type :parse-error :data :expected-space-or-right-bracket-in-doctype
1680 :datavars (:data ,data)))
1681 (setf (getf current-token :correct) nil)
1682 (setf state :bogus-doctype-state)))))
1684 (defstate :after-doctype-public-keyword-state (stream state current-token)
1685 (let ((data (html5-stream-char stream)))
1686 (cond ((find data +space-characters+)
1687 (setf state :before-doctype-public-identifier-state))
1688 ((find data '(#\' #\"))
1689 (push-token self '(:type :parse-error :data :unexpected-char-in-doctype))
1690 (html5-stream-unget stream data)
1691 (setf state :before-doctype-public-identifier-state))
1693 (push-token self '(:type :parse-error :data :eof-in-doctype))
1694 (setf (getf current-token :correct) nil)
1695 (push-token self current-token)
1696 (setf state :data-state))
1698 (html5-stream-unget stream data)
1699 (setf state :before-doctype-public-identifier-state)))))
1701 (defstate :before-doctype-public-identifier-state (stream state current-token)
1702 (let ((data (html5-stream-char stream)))
1703 (cond ((find data +space-characters+)
1707 (setf (getf current-token :public-id) "")
1708 (setf state :doctype-public-identifier-double-quoted-state))
1710 (setf (getf current-token :public-id) "")
1711 (setf state :doctype-public-identifier-single-quoted-state))
1713 (push-token self '(:type :parse-error :data :unexpected-end-of-doctype))
1714 (setf (getf current-token :correct) nil)
1715 (push-token self current-token)
1716 (setf state :data-state))
1718 (push-token self '(:type :parse-error :data :eof-in-doctype))
1719 (setf (getf current-token :correct) nil)
1720 (push-token self current-token)
1721 (setf state :data-state))
1723 (push-token self '(:type :parse-error :data :unexpected-char-in-doctype))
1724 (setf (getf current-token :correct) nil)
1725 (setf state :bogus-doctype-state)))))
1727 (defstate :doctype-public-identifier-double-quoted-state (stream state current-token)
1728 (let ((data (html5-stream-char stream)))
1729 (cond ((eql data #\")
1730 (setf state :after-doctype-public-identifier-state))
1732 (push-token self '(:type :parse-error :data :invalid-codepoint))
1733 (add-to current-token :public-id #\uFFFD))
1735 (push-token self '(:type :parse-error :data :unexpected-end-of-doctype))
1736 (setf (getf current-token :correct) nil)
1737 (push-token self current-token)
1738 (setf state :data-state))
1740 (push-token self '(:type :parse-error :data :eof-in-doctype))
1741 (setf (getf current-token :correct) nil)
1742 (push-token self current-token)
1743 (setf state :data-state))
1745 (add-to current-token :public-id data)))))
1747 (defstate :doctype-public-identifier-single-quoted-state (stream state current-token)
1748 (let ((data (html5-stream-char stream)))
1749 (cond ((eql data #\')
1750 (setf state :after-doctype-public-identifier-state))
1752 (push-token self '(:type :parse-error :data :invalid-codepoint))
1753 (add-to current-token :public-id #\uFFFD))
1755 (push-token self '(:type :parse-error :data :unexpected-end-of-doctype))
1756 (setf (getf current-token :correct) nil)
1757 (push-token self current-token)
1758 (setf state :data-state))
1760 (push-token self '(:type :parse-error :data :eof-in-doctype))
1761 (setf (getf current-token :correct) nil)
1762 (push-token self current-token)
1763 (setf state :data-state))
1765 (add-to current-token :public-id data)))))
1767 (defstate :after-doctype-public-identifier-state (stream state current-token)
1768 (let ((data (html5-stream-char stream)))
1769 (cond ((find data +space-characters+)
1770 (setf state :between-doctype-public-and-system-identifiers-state))
1772 (push-token self current-token)
1773 (setf state :data-state))
1775 (push-token self '(:type :parse-error :data :unexpected-char-in-doctype))
1776 (setf (getf current-token :system-id) "")
1777 (setf state :doctype-system-identifier-double-quoted-state))
1779 (push-token self '(:type :parse-error :data :unexpected-char-in-doctype))
1780 (setf (getf current-token :system-id) "")
1781 (setf state :doctype-system-identifier-single-quoted-state))
1783 (push-token self '(:type :parse-error :data :eof-in-doctype))
1784 (setf (getf current-token :correct) nil)
1785 (push-token self current-token)
1786 (setf state :data-state))
1788 (push-token self '(:type :parse-error :data :unexpected-char-in-doctype))
1789 (setf (getf current-token :correct) nil)
1790 (setf state :bogus-doctype-state)))))
1792 (defstate :between-doctype-public-and-system-identifiers-state (stream state current-token)
1793 (let ((data (html5-stream-char stream)))
1794 (cond ((find data +space-characters+)
1798 (push-token self current-token)
1799 (setf state :data-state))
1801 (setf (getf current-token :system-id) "")
1802 (setf state :doctype-system-identifier-double-quoted-state))
1804 (setf (getf current-token :system-id) "")
1805 (setf state :doctype-system-identifier-single-quoted-state))
1807 (push-token self '(:type :parse-error :data :eof-in-doctype))
1808 (setf (getf current-token :correct) nil)
1809 (push-token self current-token)
1810 (setf state :data-state))
1812 (push-token self '(:type :parse-error :data :unexpected-char-in-doctype))
1813 (setf (getf current-token :correct) nil)
1814 (setf state :bogus-doctype-state)))))
1816 (defstate :after-doctype-system-keyword-state (stream state current-token)
1817 (let ((data (html5-stream-char stream)))
1818 (cond ((find data +space-characters+)
1819 (setf state :before-doctype-system-identifier-state))
1820 ((find data '(#\' #\"))
1821 (push-token self '(:type :parse-error :data :unexpected-char-in-doctype))
1822 (html5-stream-unget stream data)
1823 (setf state :before-doctype-system-identifier-state))
1825 (push-token self '(:type :parse-error :data :eof-in-doctype))
1826 (setf (getf current-token :correct) nil)
1827 (push-token self current-token)
1828 (setf state :data-state))
1830 (html5-stream-unget stream data)
1831 (setf state :before-doctype-system-identifier-state)))))
1833 (defstate :before-doctype-system-identifier-state (stream state current-token)
1834 (let ((data (html5-stream-char stream)))
1835 (cond ((find data +space-characters+)
1839 (setf (getf current-token :system-id) "")
1840 (setf state :doctype-system-identifier-double-quoted-state))
1842 (setf (getf current-token :system-id) "")
1843 (setf state :doctype-system-identifier-single-quoted-state))
1845 (push-token self '(:type :parse-error :data :unexpected-end-of-doctype))
1846 (setf (getf current-token :correct) nil)
1847 (push-token self current-token)
1848 (setf state :data-state))
1850 (push-token self '(:type :parse-error :data :eof-in-doctype))
1851 (setf (getf current-token :correct) nil)
1852 (push-token self current-token)
1853 (setf state :data-state))
1855 (push-token self '(:type :parse-error :data :unexpected-char-in-doctype))
1856 (setf (getf current-token :correct) nil)
1857 (setf state :bogus-doctype-state)))))
1859 (defstate :doctype-system-identifier-double-quoted-state (stream state current-token)
1860 (let ((data (html5-stream-char stream)))
1861 (cond ((eql data #\")
1862 (setf state :after-doctype-system-identifier-state))
1864 (push-token self '(:type :parse-error :data :invalid-codepoint))
1865 (add-to current-token :system-id #\uFFFD))
1867 (push-token self '(:type :parse-error :data :unexpected-end-of-doctype))
1868 (setf (getf current-token :correct) nil)
1869 (push-token self current-token)
1870 (setf state :data-state))
1872 (push-token self '(:type :parse-error :data :eof-in-doctype))
1873 (setf (getf current-token :correct) nil)
1874 (push-token self current-token)
1875 (setf state :data-state))
1877 (add-to current-token :system-id data)))))
1879 (defstate :doctype-system-identifier-single-quoted-state (stream state current-token)
1880 (let ((data (html5-stream-char stream)))
1881 (cond ((eql data #\')
1882 (setf state :after-doctype-system-identifier-state))
1884 (push-token self '(:type :parse-error :data :invalid-codepoint))
1885 (add-to current-token :system-id #\uFFFD))
1887 (push-token self '(:type :parse-error :data :unexpected-end-of-doctype))
1888 (setf (getf current-token :correct) nil)
1889 (push-token self current-token)
1890 (setf state :data-state))
1892 (push-token self '(:type :parse-error :data :eof-in-doctype))
1893 (setf (getf current-token :correct) nil)
1894 (push-token self current-token)
1895 (setf state :data-state))
1897 (add-to current-token :system-id data)))))
1899 (defstate :after-doctype-system-identifier-state (stream state current-token)
1900 (let ((data (html5-stream-char stream)))
1901 (cond ((find data +space-characters+)
1905 (push-token self current-token)
1906 (setf state :data-state))
1908 (push-token self '(:type :parse-error :data :eof-in-doctype))
1909 (setf (getf current-token :correct) nil)
1910 (push-token self current-token)
1911 (setf state :data-state))
1913 (push-token self '(:type :parse-error :data :unexpected-char-in-doctype))
1914 (setf state :bogus-doctype-state)))))
1916 (defstate :bogus-doctype-state (stream state current-token)
1917 (let ((data (html5-stream-char stream)))
1918 (cond ((eql data #\>)
1919 (push-token self current-token)
1920 (setf state :data-state))
1923 (html5-stream-unget stream data)
1924 (push-token self current-token)
1925 (setf state :data-state))
1930 (defstate :cdata-section-state (stream state current-token)
1933 (push (html5-stream-chars-until stream '(#\])) data)
1934 (let ((char-stack '())
1936 (loop for expected across "]]>" do
1937 (push (html5-stream-char stream) char-stack)
1938 (cond ((eql (car char-stack) +eof+)
1940 (setf data (append char-stack data))
1942 ((not (eql (car char-stack) expected))
1944 (setf data (append char-stack data))
1948 (setf data (apply #'concatenate 'string (mapcar #'string (nreverse data))))
1949 ;; Deal with null here rather than in the parser
1950 (let ((null-count (count #\u0000 data)))
1951 (when (plusp null-count)
1952 (push-token self '(:type :parse-error :data :invalid-codepoint))
1953 (setf data (nsubstitute #\uFFFD #\u0000 data))))
1954 (when (plusp (length data))
1955 (push-token* self :characters data))
1956 (setf state :data-state)))
1959 ;; A basic implementation of a DOM-core like thing
1962 ((type :initform :node :allocation :class :reader node-type)
1963 (name :initarg :name :initform nil :reader node-name)
1964 (namespace :initarg :namespace :initform nil :reader node-namespace)
1965 (parent :initform nil :reader node-parent)
1966 (value :initform nil :initarg :value
1967 :accessor node-value)
1968 (child-nodes :initform nil :accessor %node-child-nodes)
1969 (last-child :initform nil :accessor last-child)))
1971 (defmethod (setf %node-child-nodes) :after (value (node node))
1972 (setf (last-child node) (last value)))
1974 (defclass document (node)
1975 ((type :initform :document :allocation :class)))
1977 (defclass document-fragment (document)
1978 ((type :initform :document-fragment :allocation :class)))
1980 (defclass document-type (node)
1981 ((type :initform :document-type :allocation :class)
1982 (public-id :initarg :public-id :reader node-public-id)
1983 (system-id :initarg :system-id :reader node-system-id)))
1985 (defclass text-node (node)
1986 ((type :initform :text :allocation :class)))
1988 (defclass element (node)
1989 ((type :initform :element :allocation :class)
1990 (attributes :initform nil :accessor %node-attributes)))
1992 (defclass comment-node (node)
1993 ((type :initform :comment :allocation :class)))
1999 (defun make-document ()
2000 (make-instance 'document))
2002 (defun make-fragment (document)
2003 (declare (ignore document))
2004 (make-instance 'document-fragment))
2006 (defun make-doctype (document name public-id system-id)
2007 (declare (ignore document))
2008 (make-instance 'document-type :name name :public-id public-id :system-id system-id))
2010 (defun make-comment (document data)
2011 (declare (ignore document))
2012 (make-instance 'comment-node :value data))
2014 (defun make-element (document name namespace)
2015 (declare (ignore document))
2016 (make-instance 'element :name name :namespace namespace))
2018 (defun make-text-node (document data)
2019 (declare (ignore document))
2020 (make-instance 'text-node :value data))
2026 (defun node-first-child (node)
2027 (car (%node-child-nodes node)))
2029 (defun node-last-child (node)
2030 (car (last-child node)))
2032 (defun node-previous-sibling (node)
2033 (loop for (this next) on (%node-child-nodes (node-parent node))
2034 when (eql next node) do (return this)))
2036 (defun node-next-sibling (node)
2037 (loop for (this next) on (%node-child-nodes (node-parent node))
2038 when (eql this node) do (return next)))
2040 (defun node-remove-child (node child)
2041 (setf (%node-child-nodes node)
2042 (remove child (%node-child-nodes node)))
2043 (setf (slot-value child 'parent) nil))
2045 (defun node-append-child (node child)
2046 (when (node-parent child)
2047 (node-remove-child (node-parent child) child))
2048 (setf (slot-value child 'parent) node)
2049 (if (%node-child-nodes node)
2050 (setf (last-child node)
2051 (push child (cdr (last-child node))))
2052 (setf (%node-child-nodes node)
2054 (%node-child-nodes node))
2056 (defun node-insert-before (node child insert-before)
2057 (let ((child-nodes (%node-child-nodes node)))
2058 (setf (slot-value child 'parent) node)
2059 (labels ((insert-before (child-nodes)
2060 (cond ((endp child-nodes)
2062 ((eql (car child-nodes) insert-before)
2063 (cons child child-nodes))
2064 (t (rplacd child-nodes (insert-before (cdr child-nodes)))))))
2065 (setf (%node-child-nodes node)
2066 (insert-before child-nodes)))))
2068 (defun element-attribute (node attribute &optional namespace)
2069 (cdr (assoc (cons attribute namespace)
2070 (%node-attributes node)
2073 (defun (setf element-attribute) (new-value node attribute
2074 &optional namespace)
2075 (check-type attribute string)
2076 (check-type new-value string)
2077 (let ((old-attr (assoc (cons attribute namespace)
2078 (%node-attributes node)
2081 (setf (cdr old-attr) new-value)
2082 (push (cons (cons attribute namespace) new-value) (%node-attributes node)))))
2088 (defun element-map-children (function node)
2089 (map nil function (%node-child-nodes node)))
2091 (defun element-map-attributes* (function node)
2092 (loop for ((name . namespace) . value) in (%node-attributes node)
2093 do (funcall function name namespace value)))
2095 (defun element-map-attributes (function node)
2096 (element-map-attributes*
2097 (lambda (name namespace value)
2100 (format nil "~A:~A" (find-prefix namespace) name)
2107 ;; Printing for the ease of debugging
2110 (defun node-count (tree)
2112 (element (1+ (apply #'+ (mapcar #'node-count (%node-child-nodes tree)))))
2113 ((or document document-fragment)
2114 (apply #'+ (mapcar #'node-count (%node-child-nodes tree))))
2117 (defmethod print-object ((node document) stream)
2118 (print-unreadable-object (node stream :type t :identity t)
2119 (format stream "nodes: ~A" (node-count node))))
2121 (defmethod print-object ((node node) stream)
2122 (print-unreadable-object (node stream :type t :identity t)
2123 (format stream "~A" (node-name node))))
2125 (defmethod print-object ((node text-node) stream)
2126 (print-unreadable-object (node stream :type t :identity t)
2127 (write (node-value node) :stream stream :length 30)))
2129 ;;; html5-parser-class
2132 (defclass html-parser ()
2133 ((html-namespace :initform (find-namespace "html"))
2134 (strict :initarg :strict)
2136 (container :initform "div")
2138 (document :initform (make-document))
2139 (errors :initform '())
2140 (phase :accessor parser-phase)
2147 (character-tokens :initform nil)
2150 active-formatting-elements
2154 (in-body-process-space-characters-mode :initform :non-pre)))
2157 (defmacro pop-end (place)
2158 "Pop from the end of list"
2159 (let ((old-list (gensym)))
2160 `(let ((,old-list ,place))
2161 (prog1 (car (last ,old-list))
2162 (setf ,place (butlast ,old-list))))))
2164 (defmacro push-end (object place)
2165 "Push to the end of list"
2167 ;(format t "~&push ~S to ~S" ',object ',place)
2168 (setf ,place (append ,place (list ,object)))))
2174 (slot-value *parser* 'document))
2176 (defun node-clone* (node)
2177 (ecase (node-type node)
2181 (make-fragment (document*)))
2183 (make-doctype (document*)
2185 (node-public-id node)
2186 (node-system-id node)))
2188 (make-comment (document*) (node-value node)))
2190 (make-text-node (document*) (node-value node)))
2192 (let ((clone (make-element (document*) (node-name node) (node-namespace node))))
2193 (element-map-attributes*
2194 (lambda (name namespace value)
2195 (setf (element-attribute clone name namespace) value))
2199 (defun node-name-tuple (node)
2200 (cons (or (node-namespace node)
2201 (find-namespace "html"))
2204 (defun node-name-tuple-values (node)
2205 (values (or (node-namespace node)
2206 (find-namespace "html"))
2209 (defun node-has-content (node)
2210 (not (null (node-first-child node))))
2212 (defun node-attributes= (node1 node2)
2213 (labels ((has-all-attributes-of (node1 node2)
2214 (element-map-attributes*
2215 (lambda (name namespace value)
2216 (unless (equal value
2217 (element-attribute node2 name namespace))
2218 (return-from has-all-attributes-of nil)))
2221 (and (has-all-attributes-of node1 node2)
2222 (has-all-attributes-of node2 node1))))
2224 (defun node-append-child* (node child)
2225 (let ((last-child (node-last-child node)))
2226 (if (and (eql :text (node-type child))
2228 (eql :text (node-type last-child)))
2229 (nconcatf (node-value last-child)
2231 (node-append-child node child))))
2233 (defun node-insert-before* (node child insert-before)
2234 (when (eql :text (node-type child))
2235 (let ((prev-child (node-previous-sibling insert-before)))
2236 (when (and prev-child
2237 (eql :text (node-type prev-child)))
2238 (node-remove-child node prev-child)
2239 (setf child (make-text-node
2241 (concatenate 'string
2242 (node-value prev-child)
2243 (node-value child)))))))
2244 (node-insert-before node child insert-before))
2246 (defun node-reparent-children (node new-parent)
2247 (element-map-children (lambda (child)
2248 (node-append-child new-parent child))
2251 (defun node-insert-text (node data &optional insert-before)
2253 (node-insert-before* node (make-text-node (document*) data) insert-before)
2254 (node-append-child* node (make-text-node (document*) data))))
2256 (defun last-open-element ()
2257 (with-slots (open-elements) *parser*
2258 (car (last open-elements))))
2260 (defun create-element (token)
2261 "Create an element but don't insert it anywhere"
2262 (with-slots (html-namespace) *parser*
2263 (let ((element (make-element (document*)
2265 (or (getf token :namespace)
2267 (loop for (name . value) in (getf token :data)
2269 (setf (element-attribute element (second name) (third name)) value)
2270 (setf (element-attribute element name) value)))
2274 (defun insert-root (token)
2275 (with-slots (open-elements) *parser*
2276 (let ((element (create-element token)))
2278 (push-end element open-elements)
2279 (node-append-child (document*) element))))
2281 (defun insert-doctype (token)
2282 (node-append-child (document*)
2283 (make-doctype (document*)
2285 (getf token :public-id)
2286 (getf token :system-id))))
2288 (defun insert-comment (token &optional parent)
2289 (with-slots (open-elements) *parser*
2291 (setf parent (car (last open-elements))))
2292 (node-append-child parent (make-comment (document*) (getf token :data)))))
2294 (defun insert-element-normal (token)
2295 (with-slots (open-elements) *parser*
2296 (let ((element (create-element token)))
2297 (node-append-child (last-open-element) element)
2298 (push-end element open-elements)
2301 (defun insert-element-table (token)
2302 (with-slots (open-elements) *parser*
2303 (if (not (member (node-name (last-open-element))
2304 +table-insert-mode-elements+ :test #'string=))
2305 (insert-element-normal token)
2306 (let ((element (create-element token)))
2307 ;; We should be in the InTable mode. This means we want to do
2308 ;; special magic element rearranging
2309 (multiple-value-bind (parent insert-before)
2310 (get-table-misnested-nodeposition)
2311 (if (not insert-before)
2312 (node-append-child* parent element)
2313 (node-insert-before* parent element insert-before))
2314 (push-end element open-elements))
2317 (defun insert-element (token)
2318 (with-slots (insert-from-table) *parser*
2319 (if insert-from-table
2320 (insert-element-table token)
2321 (insert-element-normal token))))
2323 (defun parser-insert-text (data &optional parent)
2325 (with-slots (open-elements insert-from-table) *parser*
2327 (setf parent (car (last open-elements))))
2328 (cond ((or (not insert-from-table)
2329 (and insert-from-table
2330 (not (member (node-name (last-open-element))
2331 +table-insert-mode-elements+ :test #'string=))))
2332 (node-insert-text parent data))
2334 ;; We should be in the InTable mode. This means we want to do
2335 ;; special magic element rearranging
2336 (multiple-value-bind (parent insert-before)
2337 (get-table-misnested-nodeposition)
2338 (node-insert-text parent data insert-before))))))
2340 (defun get-table-misnested-nodeposition ()
2341 "Get the foster parent element, and sibling to insert before
2342 (or None) when inserting a misnested table node"
2343 (with-slots (open-elements) *parser*
2344 ;; The foster parent element is the one which comes before the most
2345 ;; recently opened table element
2346 (let ((last-table (find "table" open-elements :key #'node-name :test #'string= :from-end t))
2348 (insert-before nil))
2351 ;; XXX - we should really check that this parent is actually a
2353 (if (node-parent last-table)
2354 (setf foster-parent (node-parent last-table)
2355 insert-before last-table)
2356 (setf foster-parent (elt open-elements (1- (position last-table open-elements))))))
2358 (setf foster-parent (first open-elements))))
2359 (values foster-parent insert-before))))
2361 (defun generate-implied-end-tags (&optional exclude)
2362 (with-slots (open-elements) *parser*
2363 (let ((name (node-name (last-open-element))))
2364 ;; XXX td, th and tr are not actually needed
2365 (when (and (member name '("dd" "dt" "li" "option" "optgroup" "p" "rp" "rt") :test #'string=)
2366 (not (equal name exclude)))
2367 (pop-end open-elements)
2368 ;; XXX This is not entirely what the specification says. We should
2369 ;; investigate it more closely.
2370 (generate-implied-end-tags exclude)))))
2372 (defun reconstruct-active-formatting-elements ()
2373 ;; Within this algorithm the order of steps described in the
2374 ;; specification is not quite the same as the order of steps in the
2375 ;; code. It should still do the same though.
2376 (with-slots (active-formatting-elements open-elements) *parser*
2378 ;; Step 1: stop the algorithm when there's nothing to do.
2379 (unless active-formatting-elements
2380 (return-from reconstruct-active-formatting-elements))
2382 ;; Step 2 and step 3: we start with the last element. So i is -1.
2383 (let* ((i (1- (length active-formatting-elements)))
2384 (entry (elt active-formatting-elements i)))
2385 (when (or (eql entry :marker)
2386 (member entry open-elements))
2387 (return-from reconstruct-active-formatting-elements))
2390 (loop while (and (not (eql entry :marker))
2391 (not (member entry open-elements))) do
2393 ;; This will be reset to 0 below
2397 ;; Step 5: let entry be one earlier in the list.
2398 (setf entry (elt active-formatting-elements i)))
2405 (setf entry (elt active-formatting-elements i))
2408 (let* ((element (insert-element (list :type :start-tag
2409 :name (node-name entry)
2410 :namespace (node-namespace entry)))))
2411 (element-map-attributes* (lambda (name namespace value)
2412 (setf (element-attribute element name namespace) value))
2416 (setf (elt active-formatting-elements i) element)
2419 (when (eql element (car (last active-formatting-elements)))
2422 (defun clear-active-formatting-elements ()
2423 (with-slots (active-formatting-elements) *parser*
2424 (loop for entry = (pop-end active-formatting-elements)
2425 while (and active-formatting-elements
2426 (not (eql entry :marker))))))
2428 (defun element-in-active-formatting-elements (name)
2429 "Check if an element exists between the end of the active
2430 formatting elements and the last marker. If it does, return it, else
2432 (with-slots (active-formatting-elements) *parser*
2433 (loop for item in (reverse active-formatting-elements) do
2434 ;; Check for Marker first because if it's a Marker it doesn't have a
2436 (when (eql item :marker)
2438 (when (string= (node-name item) name)
2441 (defun scope-tree ()
2443 (flet ((unflatten (alist)
2444 "Turn an alist into a tree."
2447 (remove-duplicates (mapcar #'car alist)
2449 (loop for (key . value) in alist
2450 do (push value (cdr (assoc key alist2
2452 ;; Put the XHTML ns first.
2455 (position (car pair)
2456 '("http://www.w3.org/1999/xhtml"
2457 "http://www.w3.org/2000/svg"
2458 "http://www.w3.org/1998/Math/MathML")
2459 :test #'string=))))))
2460 (let ((html (find-namespace "html")))
2461 `((nil . ,(unflatten +scoping-elements+))
2462 ("button" . ,(unflatten
2463 `(,@+scoping-elements+
2464 (,html . "button"))))
2465 ("list" . ,(unflatten
2466 `(,@+scoping-elements+
2469 ("table" . ((,html "html" "table")))
2470 ("select" . ((,html "optgroup" "option"))))))))
2472 (defun element-in-scope (target &optional variant)
2473 (let ((list-elements
2474 (cdr (assoc variant (scope-tree) :test #'equal)))
2475 (invert (equal "select" variant)))
2476 (dolist (node (reverse (slot-value *parser* 'open-elements)))
2477 (when (or (and (stringp target)
2478 (string= (node-name node) target))
2480 (return-from element-in-scope t))
2482 (multiple-value-bind (ns name)
2483 (node-name-tuple-values node)
2484 (let ((found (member name (cdr (assoc ns list-elements :test #'string=))
2487 (setf found (not found)))
2489 (return-from element-in-scope nil)))))
2491 (error "We should never reach this point")))
2494 ;; external interface
2495 (defun parse-html5 (source &key encoding strictp container dom)
2496 (parse-html5-from-source source
2499 :container container
2502 (defun parse-html5-fragment (source &key encoding strictp (container "div") dom)
2503 (parse-html5-from-source source
2506 :container container
2509 (defgeneric transform-html5-dom (to-type node &key)
2510 (:method ((to-type cons) node &key)
2511 (apply #'transform-html5-dom (car to-type) node (cdr to-type)))
2512 (:method (to-type node &key &allow-other-keys)
2513 (error "No TRANSFORM-HTML5-DOM method defined for dom type ~S." to-type)))
2518 (defun parse-html5-from-source (source &key container encoding strictp dom)
2519 (let ((*parser* (make-instance 'html-parser
2521 (parser-parse source
2522 :fragment-p container
2524 (with-slots (open-elements errors) *parser*
2527 (let ((fragment (make-fragment (document*))))
2528 (node-reparent-children (first open-elements) fragment)
2532 (transform-html5-dom dom document)
2534 (reverse errors))))))
2538 (defun ascii-ichar= (char1 char2)
2539 "ASCII case-insensitive char="
2540 (or (char= char1 char2)
2541 (and (or (char<= #\A char1 #\Z)
2542 (char<= #\A char2 #\Z))
2543 (char= (char-downcase char1)
2544 (char-downcase char2)))))
2546 (defun ascii-istring= (string1 string2)
2547 "ASCII case-insensitive string="
2548 (every #'ascii-ichar= string1 string2))
2550 (defun cdata-switch-helper ()
2551 (and (last-open-element)
2552 (not (equal (node-namespace (last-open-element))
2553 (slot-value *parser* 'html-namespace)))))
2555 (defun parser-parse (source &key fragment-p encoding)
2556 (with-slots (inner-html-mode container tokenizer)
2558 (setf inner-html-mode fragment-p)
2559 (when (stringp fragment-p)
2560 (setf container fragment-p))
2561 (setf tokenizer (make-html-tokenizer source
2564 :cdata-switch-helper #'cdata-switch-helper))
2567 ;; The input stream will throw please-reparse with result true
2568 ;; if the encoding is changed
2569 while (catch 'please-reparse
2572 do (parser-reset))))
2574 (defun parser-reset ()
2575 (with-slots (open-elements active-formatting-elements
2576 head-pointer form-pointer insert-from-table
2577 first-start-tag errors compat-mode inner-html-mode
2578 inner-html container tokenizer phase last-phase
2579 before-rcdata-phase frameset-ok
2582 (setf open-elements '())
2583 (setf active-formatting-elements '())
2584 (setf head-pointer nil)
2585 (setf form-pointer nil)
2586 (setf insert-from-table nil)
2587 (setf first-start-tag nil)
2589 (setf compat-mode :no-quirks)
2590 (cond (inner-html-mode
2591 (setf inner-html (string-downcase container))
2592 (cond ((member inner-html +cdata-elements+ :test #'string=)
2593 (setf (slot-value tokenizer 'state) :rcdata-state))
2594 ((member inner-html +rcdata-elements+ :test #'string=)
2595 (setf (slot-value tokenizer 'state) :rawtext-state))
2596 ((string= inner-html "plaintext")
2597 (setf (slot-value tokenizer 'state) :plaintext-state)))
2598 (insert-root (implied-tag-token "html" :start-tag))
2599 (setf phase :before-head)
2600 (reset-insertion-mode))
2602 (setf inner-html nil)
2603 (setf phase :initial)))
2605 (setf last-phase nil)
2606 (setf before-rcdata-phase nil)
2607 (setf frameset-ok t)))
2609 (defun is-html-integration-point (element)
2610 (if (and (string= (node-name element) "annotation-xml")
2611 (string= (node-namespace element) (find-namespace "mathml")))
2612 (and (element-attribute element "encoding")
2613 (member (ascii-upper-2-lower (element-attribute element "encoding"))
2614 '("text/html" "application/xhtml+xml")
2616 (member (node-name-tuple element)
2617 +html-integration-point-elements+
2620 (defun is-math-ml-text-integration-point (element)
2621 (member (node-name-tuple element)
2622 +mathml-text-integration-point-elements+
2626 (with-slots (tokenizer phase)
2628 (map-tokens tokenizer (lambda (token)
2629 (process-token (normalize-token token))))
2630 (loop with reprocess = t
2634 (setf reprocess (process-eof nil :phase phase))
2636 (assert (not (member phase phases)))))))
2638 (defun process-token (token)
2639 (with-slots (tokenizer last-open-element html-namespace)
2641 (let ((new-token token)
2643 (loop while new-token do
2644 (let* ((current-node (last-open-element))
2645 (current-node-namespace (if current-node (node-namespace current-node)))
2646 (current-node-name (if current-node (node-name current-node))))
2648 (setf type (getf new-token :type))
2650 (cond ((eql type :parse-error)
2651 (parser-parse-error (getf token :data) (getf token :datavars))
2652 (setf new-token nil))
2655 (if (or (null (slot-value *parser* 'open-elements))
2656 (equal current-node-namespace html-namespace)
2657 (and (is-math-ml-text-integration-point current-node)
2658 (or (and (eql type :start-tag)
2659 (not (member (getf token :name) '("mglyph" "malignmark") :test #'string=)))
2660 (eql type :characters)
2661 (eql type :space-characters)))
2662 (and (equal current-node-namespace (find-namespace "mathml"))
2663 (equal current-node-name "annotation-xml")
2664 (eql type :start-tag)
2665 (equal (getf token :name) "svg"))
2666 (and (is-html-integration-point current-node)
2667 (member type '(:start-tag :characters :space-characters))))
2668 (setf phase (slot-value *parser* 'phase))
2669 (setf phase :in-foreign-content))
2670 ;(format t "~&phase ~S token ~S~%" phase new-token)
2674 (process-characters new-token :phase phase))
2676 (process-space-characters new-token :phase phase))
2678 (process-start-tag new-token :phase phase))
2680 (process-end-tag new-token :phase phase))
2682 (process-comment new-token :phase phase))
2684 (process-doctype new-token :phase phase))))
2685 ;(format t " phase returned ~S new-token ~S~%" phase new-token)
2687 (when (and (eql type :start-tag)
2688 (getf token :self-closing)
2689 (not (getf token :self-closing-acknowledged)))
2690 (parser-parse-error :non-void-element-with-trailing-solidus
2691 `(:name ,(getf token :name))))))))
2693 (defun parser-parse-error (error-code &optional datavars)
2694 (with-slots (errors) *parser*
2695 (push (list error-code datavars) errors)))
2697 ;; TODO rename to a longer and more descriptive name when we are done writing the code
2698 (defun perror (error-code &rest datavars)
2699 (parser-parse-error error-code datavars))
2701 (defun normalize-token (token)
2702 (when (getf token :start-tag)
2703 ;; Remove duplicate attributes
2704 (setf (getf token :data) (remove-duplicates (getf token :data)
2710 (defun adjust-attributes (token replacements)
2711 (setf (getf token :data)
2712 (loop for (name . value) in (getf token :data)
2713 collect (cons (or (cdr (assoc name replacements :test #'string=))
2717 (defun adjust-math-ml-attributes (token)
2718 (adjust-attributes token '(("definitionurl" ."definitionURL"))))
2720 (defun adjust-svg-attributes (token)
2721 (adjust-attributes token '(("attributename" . "attributeName")
2722 ("attributetype" . "attributeType")
2723 ("basefrequency" . "baseFrequency")
2724 ("baseprofile" . "baseProfile")
2725 ("calcmode" . "calcMode")
2726 ("clippathunits" . "clipPathUnits")
2727 ("contentscripttype" . "contentScriptType")
2728 ("contentstyletype" . "contentStyleType")
2729 ("diffuseconstant" . "diffuseConstant")
2730 ("edgemode" . "edgeMode")
2731 ("externalresourcesrequired" . "externalResourcesRequired")
2732 ("filterres" . "filterRes")
2733 ("filterunits" . "filterUnits")
2734 ("glyphref" . "glyphRef")
2735 ("gradienttransform" . "gradientTransform")
2736 ("gradientunits" . "gradientUnits")
2737 ("kernelmatrix" . "kernelMatrix")
2738 ("kernelunitlength" . "kernelUnitLength")
2739 ("keypoints" . "keyPoints")
2740 ("keysplines" . "keySplines")
2741 ("keytimes" . "keyTimes")
2742 ("lengthadjust" . "lengthAdjust")
2743 ("limitingconeangle" . "limitingConeAngle")
2744 ("markerheight" . "markerHeight")
2745 ("markerunits" . "markerUnits")
2746 ("markerwidth" . "markerWidth")
2747 ("maskcontentunits" . "maskContentUnits")
2748 ("maskunits" . "maskUnits")
2749 ("numoctaves" . "numOctaves")
2750 ("pathlength" . "pathLength")
2751 ("patterncontentunits" . "patternContentUnits")
2752 ("patterntransform" . "patternTransform")
2753 ("patternunits" . "patternUnits")
2754 ("pointsatx" . "pointsAtX")
2755 ("pointsaty" . "pointsAtY")
2756 ("pointsatz" . "pointsAtZ")
2757 ("preservealpha" . "preserveAlpha")
2758 ("preserveaspectratio" . "preserveAspectRatio")
2759 ("primitiveunits" . "primitiveUnits")
2762 ("repeatcount" . "repeatCount")
2763 ("repeatdur" . "repeatDur")
2764 ("requiredextensions" . "requiredExtensions")
2765 ("requiredfeatures" . "requiredFeatures")
2766 ("specularconstant" . "specularConstant")
2767 ("specularexponent" . "specularExponent")
2768 ("spreadmethod" . "spreadMethod")
2769 ("startoffset" . "startOffset")
2770 ("stddeviation" . "stdDeviation")
2771 ("stitchtiles" . "stitchTiles")
2772 ("surfacescale" . "surfaceScale")
2773 ("systemlanguage" . "systemLanguage")
2774 ("tablevalues" . "tableValues")
2775 ("targetx" . "targetX")
2776 ("targety" . "targetY")
2777 ("textlength" . "textLength")
2778 ("viewbox" . "viewBox")
2779 ("viewtarget" . "viewTarget")
2780 ("xchannelselector" . "xChannelSelector")
2781 ("ychannelselector" . "yChannelSelector")
2782 ("zoomandpan" . "zoomAndPan"))))
2784 (defun adjust-foreign-attributes (token)
2785 (adjust-attributes token `(("xlink:actuate" . ("xlink" "actuate" ,(find-namespace "xlink")))
2786 ("xlink:arcrole" . ("xlink" "arcrole" ,(find-namespace "xlink")))
2787 ("xlink:href" . ("xlink" "href" ,(find-namespace "xlink")))
2788 ("xlink:role" . ("xlink" "role" ,(find-namespace "xlink")))
2789 ("xlink:show" . ("xlink" "show" ,(find-namespace "xlink")))
2790 ("xlink:title" . ("xlink" "title" ,(find-namespace "xlink")))
2791 ("xlink:type" . ("xlink" "type" ,(find-namespace "xlink")))
2792 ("xml:base" . ("xml" "base" ,(find-namespace "xml")))
2793 ("xml:lang" . ("xml" "lang" ,(find-namespace "xml")))
2794 ("xml:space" . ("xml" "space" ,(find-namespace "xml")))
2795 ("xmlns" . (nil "xmlns" ,(find-namespace "xmlns")))
2796 ("xmlns:xlink" . ("xmlns" "xlink" ,(find-namespace "xmlns"))))))
2798 (defun reset-insertion-mode ()
2799 (with-slots (inner-html html-namespace phase open-elements) *parser*
2802 (new-modes '(("select" . :in-select)
2806 ("tbody" . :in-table-body)
2807 ("thead" . :in-table-body)
2808 ("tfoot" . :in-table-body)
2809 ("caption" . :in-caption)
2810 ("colgroup" . :in-column-group)
2811 ("table" . :in-table)
2814 ("frameset" . :in-frameset)
2815 ("html" . :before-head))))
2816 (loop for node in (reverse open-elements)
2817 for node-name = (node-name node)
2819 (when (eql node (first open-elements))
2822 (setf node-name inner-html))
2823 ;; Check for conditions that should only happen in the innerHTML
2825 (when (member node-name '("select" "colgroup" "head" "html") :test #'string=)
2826 (assert inner-html))
2828 (unless (and (not last)
2829 (string/= (node-namespace node) html-namespace))
2830 (let ((match (cdr (assoc node-name new-modes :test #'string=))))
2832 (setf new-phase match)
2835 (setf new-phase :in-body)
2837 (setf phase new-phase))))
2839 (defun parse-rc-data-raw-text (token content-type)
2840 (assert (member content-type '(:rawtext :rcdata)))
2841 (with-slots (tokenizer original-phase phase) *parser*
2842 (insert-element token)
2843 (setf (tokenizer-state tokenizer) (ecase content-type
2844 (:rawtext :rawtext-state)
2845 (:rcdata :rcdata-state)))
2846 (setf original-phase phase)
2851 ;; Phases --------------------------------------------------------------------
2853 (defun implied-tag-token (name &optional (type :end-tag))
2854 (list :type type :name name :data '() :self-closing nil))
2856 (defun implied-tag-token/full (name type
2857 &key (attributes '()) (self-closing nil))
2858 (list :type type :name name :data attributes :self-closing self-closing))
2860 (eval-when (:compile-toplevel :execute)
2861 (defun phase-process-method-name (function-name)
2862 (intern (concatenate 'string
2864 (symbol-name function-name))
2865 (symbol-package function-name))))
2867 (defvar *phase-indent* 0)
2869 (defun call-phase-method (name phase token)
2870 ;(format *trace-output* "~&~vTcall: ~S ~S ~S" *phase-indent* name phase token)
2872 (let ((result (let ((*phase-indent* (+ 4 *phase-indent*)))
2873 (funcall name phase token))))
2874 ;(format *trace-output* "~&~vTreturn: ~S ~S" *phase-indent* name result)
2877 (defmacro define-phase-process-functions (&body defs)
2879 ,@(loop for function-name in defs
2880 for method-name = (phase-process-method-name function-name)
2881 collect `(defgeneric ,method-name (phase token))
2882 collect `(defun ,function-name (token &key (phase *phase*))
2883 (call-phase-method #',method-name phase token)))))
2885 (define-phase-process-functions
2886 add-formatting-element
2887 end-tag-applet-marquee-object
2900 end-tag-html-body-br
2913 end-tag-table-row-group
2921 process-space-characters
2924 start-tag-applet-marquee-object
2925 start-tag-base-link-command
2933 start-tag-formatting
2943 start-tag-imply-tbody
2950 start-tag-no-script-no-frames-style
2957 start-tag-param-source
2959 start-tag-pre-listing
2960 start-tag-process-in-head
2966 start-tag-style-script
2969 start-tag-table-cell
2970 start-tag-table-element
2971 start-tag-table-other
2975 start-tag-void-formatting
2978 (defmacro def (phase name (&rest slots) &body body)
2979 `(defmethod ,(phase-process-method-name name) ((*phase* (eql ,phase)) token)
2980 (with-slots (,@slots) *parser*
2983 (defmacro tagname-dispatch (phase name &body cases)
2984 `(def ,phase ,name ()
2985 (let ((tagname (getf token :name)))
2986 (declare (ignorable tagname))
2987 ,(let* ((default '(error "Unhandled tag ~S" tagname))
2989 (loop for (tagnames function) in cases
2990 append (cond ((stringp tagnames)
2991 `((,tagnames (,function token))))
2993 (loop for tag in tagnames
2994 collect `(,tag (,function token))))
2995 ((eql 'default tagnames)
2996 (setf default `(,function token))
2998 (t (error "Invalid tag name clause ~S" tagnames))))))
2999 (if (not string-cases)
3002 (tagname :default ,default)
3003 ,@string-cases))))))
3007 (defmethod %process-comment (*phase* token)
3008 ;; For most phases the following is correct. Where it's not it will be
3010 (insert-comment token (last-open-element))
3013 (defmethod %process-doctype (*phase* token)
3014 (parser-parse-error :unexpected-doctype)
3017 (defmethod %process-characters (*phase* token)
3018 (parser-insert-text (getf token :data))
3021 (defmethod %process-space-characters (*phase* token)
3022 (parser-insert-text (getf token :data))
3025 (defmethod %start-tag-html (*phase* token)
3026 (with-slots (first-start-tag open-elements)
3028 (when (and (not first-start-tag)
3029 (string= (getf token :name) "html"))
3030 (parser-parse-error :non-html-root))
3031 ;; XXX Need a check here to see if the first start tag token emitted is
3032 ;; this token... If it's not, invoke self.parser.parseError().
3033 (let ((root-element (first open-elements)))
3034 (loop for (name . value) in (getf token :data)
3035 do (unless (element-attribute root-element name)
3036 (setf (element-attribute root-element name) value))))
3037 (setf first-start-tag nil)
3043 (def :initial process-space-characters ()
3046 (def :initial process-comment ()
3047 (insert-comment token (document*))
3050 (def :initial process-doctype (compat-mode phase)
3051 (destructuring-bind (&key name public-id system-id correct &allow-other-keys)
3054 (when (or (string/= name "html")
3056 (and system-id (string/= system-id "about:legacy-compat")))
3057 (parser-parse-error :unknown-doctype))
3060 (setf public-id ""))
3062 (insert-doctype token)
3064 (setf public-id (ascii-upper-2-lower public-id))
3066 (cond ((or (not correct)
3067 (string/= name "html")
3068 (cl-ppcre:scan +quirks-mode-doctypes-regexp+ public-id)
3069 (member public-id '("-//w3o//dtd w3 html strict 3.0//en//"
3070 "-/w3c/dtd html 4.0 transitional/en"
3073 (and (not system-id)
3074 (cl-ppcre:scan '(:sequence :start-anchor (:alternation
3075 "-//w3c//dtd html 4.01 frameset//"
3076 "-//w3c//dtd html 4.01 transitional//"))
3079 (equal (ascii-upper-2-lower system-id)
3080 "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd")))
3081 (setf compat-mode :quirks))
3082 ((or (cl-ppcre:scan '(:sequence :start-anchor (:alternation
3083 "-//w3c//dtd xhtml 1.0 frameset//"
3084 "-//w3c//dtd xhtml 1.0 transitional//"))
3087 (cl-ppcre:scan '(:sequence :start-anchor (:alternation
3088 "-//w3c//dtd html 4.01 frameset//"
3089 "-//w3c//dtd html 4.01 transitional//"))
3091 (setf compat-mode :limited-quirks)))
3092 (setf phase :before-html)
3095 (flet ((anything-else ()
3096 (with-slots (compat-mode phase)
3098 (setf compat-mode :quirks)
3099 (setf phase :before-html))))
3101 (def :initial process-characters ()
3102 (parser-parse-error :expected-doctype-but-got-chars)
3106 (def :initial process-start-tag ()
3107 (parser-parse-error :expected-doctype-but-got-start-tag
3108 (list :name (getf token :name)))
3112 (def :initial process-end-tag ()
3113 (parser-parse-error :expected-doctype-but-got-end-tag
3114 (list :name (getf token :name)))
3118 (def :initial process-eof ()
3119 (parser-parse-error :expected-doctype-but-got-eof)
3126 (flet ((insert-html-element ()
3127 (insert-root (implied-tag-token "html" :start-tag))
3128 (setf (parser-phase *parser*) :before-head)))
3131 (def :before-html process-eof ()
3132 (insert-html-element)
3135 (def :before-html process-comment ()
3136 (insert-comment token (document*))
3139 (def :before-html process-space-characters ()
3142 (def :before-html process-characters ()
3143 (insert-html-element)
3146 (def :before-html process-start-tag (first-start-tag)
3147 (when (string= (getf token :name) "html")
3148 (setf first-start-tag t))
3149 (insert-html-element)
3152 (def :before-html process-end-tag ()
3153 (cond ((not (member (getf token :name) '("head" "body" "html" "br") :test #'string=))
3154 (parser-parse-error :unexpected-end-tag-before-html `(:name ,(getf token :name)))
3157 (insert-html-element)
3162 (tagname-dispatch :before-head process-start-tag
3163 ("html" start-tag-html)
3164 ("head" start-tag-head token)
3165 (default start-tag-other))
3167 (tagname-dispatch :before-head process-end-tag
3168 (("head" "body" "html" "br") end-tag-imply-head)
3169 (default end-tag-other))
3171 (def :before-head process-eof ()
3172 (start-tag-head (implied-tag-token "head" :start-tag))
3175 (def :before-head process-space-characters ()
3178 (def :before-head process-characters ()
3179 (start-tag-head (implied-tag-token "head" :start-tag))
3182 (def :before-head start-tag-html ()
3183 (process-start-tag token :phase :in-body))
3185 (def :before-head start-tag-head (head-pointer)
3186 (insert-element token)
3187 (setf head-pointer (last-open-element))
3188 (setf (parser-phase *parser*) :in-head)
3191 (def :before-head start-tag-other ()
3192 (start-tag-head (implied-tag-token "head" :start-tag))
3195 (def :before-head end-tag-imply-head ()
3196 (start-tag-head (implied-tag-token "head" :start-tag))
3199 (def :before-head end-tag-other ()
3200 (parser-parse-error :end-tag-after-implied-root `(:name ,(getf token :name)))
3205 (tagname-dispatch :in-head process-start-tag
3206 ("html" start-tag-html)
3207 ("title" start-tag-title)
3208 (("noscript" "noframes" "style") start-tag-no-script-no-frames-style)
3209 ("script" start-tag-script)
3210 (("base" "basefont" "bgsound" "command" "link") start-tag-base-link-command)
3211 ("meta" start-tag-meta)
3212 ("head" start-tag-head)
3213 (default start-tag-other))
3215 (tagname-dispatch :in-head process-end-tag
3216 ("head" end-tag-head)
3217 (("br" "html" "body") end-tag-html-body-br)
3218 (default end-tag-other))
3220 (flet ((anything-else ()
3221 (end-tag-head (implied-tag-token "head"))))
3224 (def :in-head process-eof ()
3228 (def :in-head process-characters ()
3232 (def :in-head start-tag-html ()
3233 (process-start-tag token :phase :in-body))
3235 (def :in-head start-tag-head ()
3236 (parser-parse-error :two-heads-are-not-better-than-one)
3239 (def :in-head start-tag-base-link-command (open-elements)
3240 (insert-element token)
3241 (pop-end open-elements)
3242 (setf (getf token :self-closing-acknowledged) t)
3245 (defun parse-content-attr (string)
3246 "The algorithm for extracting an encoding from a meta element"
3247 (let ((position 0)) ; Step 1
3248 (labels ((char-at (index)
3249 (and (< position (length string))
3250 (char string index)))
3252 (loop while (member (char-at position) +space-characters+)
3253 do (incf position))))
3256 (setf position (search "charset" string :start2 position))
3258 (return-from parse-content-attr))
3259 ;; Set position to after charset
3264 (when (eql (char-at position) #\=)
3271 (let ((next-char (char-at position)))
3272 (cond ((or (eql #\' next-char)
3273 (eql #\" next-char))
3275 (let ((end (position next-char string :start position)))
3277 (subseq string position end))))
3279 (let ((start position))
3280 (loop until (or (= position (length string))
3281 (member (char-at position) +space-characters+))
3283 (subseq string start position))))))))
3286 (def :in-head start-tag-meta (tokenizer open-elements)
3287 (insert-element token)
3288 (pop-end open-elements)
3289 (setf (getf token :self-closing-acknowledged) t)
3291 (let ((attributes (getf token :data)))
3292 (when (eql (cdr (html5-stream-encoding (tokenizer-stream tokenizer))) :tentative)
3293 (cond ((assoc "charset" attributes :test #'string=)
3294 (html5-stream-change-encoding (tokenizer-stream tokenizer)
3295 (cdr (assoc "charset" attributes :test #'string=))))
3296 ((and (assoc "http-equiv" attributes :test #'string=)
3297 (ascii-istring= (cdr (assoc "http-equiv" attributes :test #'string=))
3299 (assoc "content" attributes :test #'string=))
3300 (let* ((content (cdr (assoc "content" attributes :test #'string=)))
3301 (new-encoding (parse-content-attr content)))
3303 (html5-stream-change-encoding (tokenizer-stream tokenizer)
3305 (parser-parse-error :invalid-encoding-declaration
3306 `(:content ,content))))))))
3309 (def :in-head start-tag-title ()
3310 (parse-rc-data-raw-text token :rcdata)
3313 (def :in-head start-tag-no-script-no-frames-style ()
3314 ;; Need to decide whether to implement the scripting-disabled case
3315 (parse-rc-data-raw-text token :rawtext))
3317 (def :in-head start-tag-script (tokenizer original-phase phase)
3318 (insert-element token)
3319 (setf (tokenizer-state tokenizer) :script-data-state)
3320 (setf original-phase phase)
3324 (def :in-head start-tag-other ()
3328 (def :in-head end-tag-head (phase open-elements)
3329 (let ((node (pop-end open-elements)))
3330 (assert (string= (node-name node) "head") () "Expected head got ~S" (node-name node))
3331 (setf phase :after-head)
3334 (def :in-head end-tag-html-body-br ()
3338 (def :in-head end-tag-other ()
3339 (parser-parse-error :unexpected-end-tag `(:name ,(getf token :name)))
3342 ;; XXX If we implement a parser for which scripting is disabled we need to
3343 ;; implement this phase.
3345 ;; InHeadNoScriptPhase
3349 (tagname-dispatch :after-head process-start-tag
3350 ("html" start-tag-html)
3351 ("body" start-tag-body)
3352 ("frameset" start-tag-frameset)
3353 (("base" "basefont" "bgsound" "link" "meta"
3354 "noframes" "script" "style" "title")
3355 start-tag-from-head)
3356 ("head" start-tag-head)
3357 (default start-tag-other))
3359 (tagname-dispatch :after-head process-end-tag
3360 (("body" "html" "br") end-tag-html-body-br)
3361 (default end-tag-other))
3363 (flet ((anything-else ()
3364 (with-slots (phase frameset-ok) *parser*
3365 (insert-element (implied-tag-token "body" :start-tag))
3366 (setf phase :in-body)
3367 (setf frameset-ok t))))
3369 (def :after-head process-eof ()
3373 (def :after-head process-characters ()
3377 (def :after-head start-tag-html ()
3378 (process-start-tag token :phase :in-body))
3380 (def :after-head start-tag-body (phase frameset-ok)
3381 (setf frameset-ok nil)
3382 (insert-element token)
3383 (setf phase :in-body)
3386 (def :after-head start-tag-frameset (phase)
3387 (insert-element token)
3388 (setf phase :in-frameset)
3391 (def :after-head start-tag-from-head (head-pointer open-elements)
3392 (parser-parse-error :unexpected-start-tag-out-of-my-head
3393 `(:name ,(getf token :name)))
3394 (push-end head-pointer open-elements)
3395 (process-start-tag token :phase :in-head)
3396 (loop for node in (reverse open-elements)
3397 do (when (string= "head" (node-name node))
3399 (remove node open-elements :test #'equal))
3403 (def :after-head start-tag-head ()
3404 (parser-parse-error :unexpected-start-tag
3405 `(:name ,(getf token :name)))
3408 (def :after-head start-tag-other ()
3412 (def :after-head end-tag-html-body-br ()
3416 (def :after-head end-tag-other ()
3417 (parser-parse-error :unexpected-end-tag
3418 `(:name ,(getf token :name)))
3423 (tagname-dispatch :in-body process-start-tag
3424 ("html" start-tag-html)
3425 (("base" "basefont" "bgsound" "command" "link"
3426 "meta" "noframes" "script" "style" "title")
3427 start-tag-process-in-head)
3428 ("body" start-tag-body)
3429 ("frameset" start-tag-frameset)
3430 (("address" "article" "aside" "blockquote" "center" "details"
3431 "dir" "div" "dl" "fieldset" "figcaption" "figure"
3432 "footer" "header" "hgroup" "menu" "nav" "ol" "p"
3433 "section" "summary" "ul")
3435 (#.+heading-elements+ start-tag-heading)
3436 (("pre" "listing") start-tag-pre-listing)
3437 ("form" start-tag-form)
3438 (("li" "dd" "dt") start-tag-list-item)
3439 ("plaintext" start-tag-plaintext)
3441 (("b" "big" "code" "em" "font" "i" "s" "small" "strike"
3443 start-tag-formatting)
3444 ("nobr" start-tag-nobr)
3445 ("button" start-tag-button)
3446 (("applet" "marquee" "object") start-tag-applet-marquee-object)
3447 ("xmp" start-tag-xmp)
3448 ("table" start-tag-table)
3449 (("area" "br" "embed" "img" "keygen" "wbr")
3450 start-tag-void-formatting)
3451 (("param" "source" "track") start-tag-param-source)
3452 ("input" start-tag-input)
3454 ("image" start-tag-image)
3455 ("isindex" start-tag-is-index)
3456 ("textarea" start-tag-textarea)
3457 ("iframe" start-tag-i-frame)
3458 (("noembed" "noscript") start-tag-rawtext)
3459 ("select" start-tag-select)
3460 (("rp" "rt") start-tag-rp-rt)
3461 (("option" "optgroup") start-tag-opt)
3462 (("math") start-tag-math)
3463 (("svg") start-tag-svg)
3464 (("caption" "col" "colgroup" "frame" "head"
3465 "tbody" "td" "tfoot" "th" "thead"
3467 start-tag-misplaced)
3468 (default start-tag-other))
3470 (tagname-dispatch :in-body process-end-tag
3471 ("body" end-tag-body)
3472 ("html" end-tag-html)
3473 (("address" "article" "aside" "blockquote" "button" "center"
3474 "details" "dir" "div" "dl" "fieldset" "figcaption" "figure"
3475 "footer" "header" "hgroup" "listing" "menu" "nav" "ol" "pre"
3476 "section" "summary" "ul")
3478 ("form" end-tag-form)
3480 (("dd" "dt" "li") end-tag-list-item)
3481 (#.+heading-elements+ end-tag-heading)
3482 (("a" "b" "big" "code" "em" "font" "i" "nobr" "s" "small"
3483 "strike" "strong" "tt" "u")
3485 (("applet" "marquee" "object") end-tag-applet-marquee-object)
3487 (default end-tag-other))
3489 (flet ((is-matching-formatting-element (node1 node2)
3490 (and (equal (node-name node1) (node-name node2))
3491 (equal (node-namespace node1) (node-namespace node2))
3492 (node-attributes= node1 node2))))
3494 (def :in-body add-formatting-element (reverse active-formatting-elements)
3495 (insert-element token)
3496 (let ((element (last-open-element))
3498 (loop for node in (reverse active-formatting-elements)
3499 do (if (eq node :marker)
3501 (when (is-matching-formatting-element node element)
3502 (push-end node matching-elements))))
3503 (assert (<= (length matching-elements) 3))
3504 (when (= (length matching-elements) 3)
3505 (setf active-formatting-elements
3506 (remove (car (last matching-elements))
3507 active-formatting-elements)))
3509 (push-end element active-formatting-elements))
3512 (def :in-body process-eof (open-elements)
3513 (let ((allowed-elements '("dd" "dt" "li" "p" "tbody" "td"
3514 "tfoot" "th" "thead" "tr" "body" "html")))
3515 (loop for node in (reverse open-elements)
3516 do (when (not (member (node-name node)
3519 (parser-parse-error :expected-closing-tag-but-got-eof)
3523 (def :in-body process-characters (frameset-ok)
3524 (let ((data (getf token :data)))
3525 (if (equal data (string #\u0000))
3528 (reconstruct-active-formatting-elements)
3529 (parser-insert-text data)
3530 ;;This must be bad for performance
3531 (when (and frameset-ok
3532 (notevery (lambda (char)
3533 (find char +space-characters+))
3535 (setf frameset-ok nil))
3538 (def :in-body process-space-characters (in-body-process-space-characters-mode)
3539 (ecase in-body-process-space-characters-mode
3541 (reconstruct-active-formatting-elements)
3542 (parser-insert-text (getf token :data)))
3544 (let ((data (getf token :data)))
3545 (setf in-body-process-space-characters-mode :non-pre)
3546 (when (and (plusp (length data))
3547 (char= #\Newline (char data 0))
3548 (member (node-name (last-open-element))
3549 '("pre" "listing" "textarea")
3551 (not (node-has-content (last-open-element))))
3552 (setf data (subseq data 1)))
3553 (when (plusp (length data))
3554 (reconstruct-active-formatting-elements)
3555 (parser-insert-text data)))))
3558 (def :in-body start-tag-process-in-head ()
3559 (process-start-tag token :phase :in-head))
3561 (def :in-body start-tag-body (frameset-ok open-elements)
3562 (parser-parse-error :unexpected-start-tag
3563 `(:name ,(getf token :name)))
3564 (if (or (= 1 (length open-elements))
3565 (string/= (node-name (second open-elements)) "body"))
3566 (assert (slot-value *parser* 'inner-html))
3568 (setf frameset-ok nil)
3569 (loop for (name . value) in (getf token :data)
3570 do (unless (element-attribute (second open-elements) name)
3571 (setf (element-attribute (second open-elements) name) value)))))
3574 (def :in-body start-tag-frameset (frameset-ok phase open-elements)
3575 (parser-parse-error :unexpected-start-tag
3576 `(:name ,(getf token :name)))
3577 (cond ((or (= 1 (length open-elements))
3578 (string/= (node-name (second open-elements)) "body"))
3579 (assert (slot-value *parser* 'inner-html)))
3583 (when (node-parent (second open-elements))
3584 (node-remove-child (node-parent (second open-elements))
3585 (second open-elements)))
3586 (loop until (string= (node-name (last-open-element))
3588 do (pop-end open-elements))
3589 (insert-element token)
3590 (setf phase :in-frameset)))
3593 (def :in-body start-tag-close-p ()
3594 (when (element-in-scope "p" "button")
3595 (end-tag-p (implied-tag-token "p")))
3596 (insert-element token)
3599 (def :in-body start-tag-pre-listing (in-body-process-space-characters-mode frameset-ok)
3600 (when (element-in-scope "p" "button")
3601 (end-tag-p (implied-tag-token "p")))
3602 (insert-element token)
3603 (setf frameset-ok nil)
3604 (setf in-body-process-space-characters-mode :drop-newline)
3607 (def :in-body start-tag-form (form-pointer)
3609 (parser-parse-error :unexpected-start-tag
3610 `(:name ,(getf token :name)))
3612 (when (element-in-scope "p" "button")
3613 (end-tag-p (implied-tag-token "p")))
3614 (insert-element token)
3615 (setf form-pointer (last-open-element))))
3618 (def :in-body start-tag-list-item (phase frameset-ok open-elements)
3619 (setf frameset-ok nil)
3620 (let ((stop-names (cond ((string= (getf token :name) "li")
3622 ((string= (getf token :name) "dt")
3624 ((string= (getf token :name) "dd")
3626 (loop for node in (reverse open-elements)
3627 do (cond ((member (node-name node) stop-names :test #'string=)
3628 (process-end-tag (implied-tag-token (node-name node)) :phase phase)
3630 ((and (member (node-name-tuple node) +special-elements+
3632 (not (member (node-name node)
3633 '("address" "div" "p")
3636 (when (element-in-scope "p" "button")
3637 (process-end-tag (implied-tag-token "p") :phase phase))
3638 (insert-element token)
3641 (def :in-body start-tag-plaintext (tokenizer)
3642 (when (element-in-scope "p" "button")
3643 (end-tag-p (implied-tag-token "p")))
3644 (insert-element token)
3645 (setf (tokenizer-state tokenizer) :plaintext-state)
3648 (def :in-body start-tag-heading (open-elements)
3649 (when (element-in-scope "p" "button")
3650 (end-tag-p (implied-tag-token "p")))
3651 (when (member (node-name (last-open-element)) +heading-elements+
3653 (perror :unexpected-start-tag :name (getf token :name))
3654 (pop-end open-elements))
3655 (insert-element token)
3658 (def :in-body start-tag-a (open-elements active-formatting-elements)
3659 (let ((afe-a-element (element-in-active-formatting-elements "a")))
3661 (perror :unexpected-start-tag-implies-end-tag
3662 :start-name "a" :end-name "a")
3663 (end-tag-formatting (implied-tag-token "a"))
3664 (when (member afe-a-element open-elements)
3666 (remove afe-a-element open-elements)))
3667 (when (member afe-a-element active-formatting-elements)
3668 (setf active-formatting-elements
3669 (remove afe-a-element active-formatting-elements))))
3670 (reconstruct-active-formatting-elements)
3671 (add-formatting-element token))
3674 (def :in-body start-tag-formatting ()
3675 (reconstruct-active-formatting-elements)
3676 (add-formatting-element token)
3679 (def :in-body start-tag-nobr ()
3680 (reconstruct-active-formatting-elements)
3681 (when (element-in-scope "nobr")
3682 (perror :unexpected-start-tag-implies-end-tag
3683 :start-name "nobr" :end-name "nobr")
3684 (process-end-tag (implied-tag-token "nobr"))
3685 ;; XXX Need tests that trigger the following
3686 (reconstruct-active-formatting-elements))
3687 (add-formatting-element token)
3690 (def :in-body start-tag-button (frameset-ok)
3691 (cond ((element-in-scope "button")
3692 (perror :unexpected-start-tag-implies-end-tag
3693 :start-name "button" :end-name "button")
3694 (process-end-tag (implied-tag-token "button"))
3697 (reconstruct-active-formatting-elements)
3698 (insert-element token)
3699 (setf frameset-ok nil)
3702 (def :in-body start-tag-applet-marquee-object (frameset-ok active-formatting-elements)
3703 (reconstruct-active-formatting-elements)
3704 (insert-element token)
3705 (push-end :marker active-formatting-elements)
3706 (setf frameset-ok nil)
3709 (def :in-body start-tag-xmp (frameset-ok)
3710 (when (element-in-scope "p" "button")
3711 (end-tag-p (implied-tag-token "p")))
3712 (reconstruct-active-formatting-elements)
3713 (setf frameset-ok nil)
3714 (parse-rc-data-raw-text token :rawtext)
3717 (def :in-body start-tag-table (frameset-ok compat-mode phase)
3718 (when (not (eq compat-mode :quirks))
3719 (when (element-in-scope "p" "button")
3720 (end-tag-p (implied-tag-token "p"))))
3721 (insert-element token)
3722 (setf frameset-ok nil)
3723 (setf phase :in-table)
3726 (def :in-body start-tag-void-formatting (frameset-ok open-elements)
3727 (reconstruct-active-formatting-elements)
3728 (insert-element token)
3729 (pop-end open-elements)
3730 (setf (getf token :self-closing-acknowledged) t)
3731 (setf frameset-ok nil)
3734 (def :in-body start-tag-input (frameset-ok)
3735 (let ((old-frameset-ok frameset-ok))
3736 (start-tag-void-formatting token)
3737 (let ((type (assoc "type" (getf token :data) :test #'string=)))
3739 (string= (ascii-upper-2-lower (cdr type)) "hidden"))
3740 ;;input type=hidden doesn't change framesetOK
3741 (setf frameset-ok old-frameset-ok))))
3744 (def :in-body start-tag-param-source (open-elements)
3745 (insert-element token)
3746 (pop-end open-elements)
3747 (setf (getf token :self-closing-acknowledged) t)
3750 (def :in-body start-tag-hr (frameset-ok open-elements)
3751 (when (element-in-scope "p" "button")
3752 (end-tag-p (implied-tag-token "p")))
3753 (insert-element token)
3754 (pop-end open-elements)
3755 (setf (getf token :self-closing-acknowledged) t)
3756 (setf frameset-ok nil)
3759 (def :in-body start-tag-image ()
3760 (perror :unexpected-start-tag-treated-as
3761 :original-name "image" :new-name "img")
3762 (process-start-tag (implied-tag-token/full
3764 :attributes (getf token :data)
3765 :self-closing (getf token :self-closing)))
3768 (def :in-body start-tag-is-index (form-pointer)
3770 (perror :deprecated-tag :name "isindex")
3774 (when (assoc "action" (getf token :data) :test #'string=)
3775 (setf attrs (list (assoc "action" (getf token :data) :test #'string=))))
3776 (process-start-tag (implied-tag-token/full "form" :start-tag
3777 :attributes attrs)))
3778 (process-start-tag (implied-tag-token "hr" :start-tag))
3779 (process-start-tag (implied-tag-token "label" :start-tag))
3780 ;; XXX Localization ...
3781 (let ((prompt (if (assoc "prompt" (getf token :data) :test #'string=)
3782 (cdr (assoc "prompt" (getf token :data) :test #'string=))
3783 "This is a searchable index. Enter search keywords: ")))
3784 (process-characters (list :type :characters :data prompt)))
3785 (let ((attrs (append (remove-if (lambda (el)
3786 (member (car el) '("action" "prompt" "name")
3788 (copy-list (getf token :data)))
3789 (copy-list '(("name" . "isindex"))))))
3790 (process-start-tag (implied-tag-token/full "input" :start-tag
3793 (getf token :self-closing))))
3794 (process-end-tag (implied-tag-token "label"))
3795 (process-start-tag (implied-tag-token "hr" :start-tag))
3796 (process-end-tag (implied-tag-token "form")))
3799 (def :in-body start-tag-textarea (tokenizer
3800 in-body-process-space-characters-mode
3802 (insert-element token)
3803 (setf (tokenizer-state tokenizer) :rcdata-state)
3804 (setf in-body-process-space-characters-mode :drop-newline)
3805 (setf frameset-ok nil)
3808 (def :in-body start-tag-i-frame (frameset-ok)
3809 (setf frameset-ok nil)
3810 (start-tag-rawtext token)
3813 (def :in-body start-tag-rawtext ()
3814 ;;;iframe, noembed noframes, noscript(if scripting enabled)
3815 (parse-rc-data-raw-text token :rawtext)
3818 (def :in-body start-tag-opt (phase)
3819 (when (string= (node-name (last-open-element)) "option")
3820 (process-end-tag (implied-tag-token "option") :phase phase))
3821 (reconstruct-active-formatting-elements)
3822 (insert-element token)
3825 (def :in-body start-tag-select (frameset-ok)
3826 (reconstruct-active-formatting-elements)
3827 (insert-element token)
3828 (setf frameset-ok nil)
3829 (if (member (parser-phase *parser*) '(:in-table :in-caption :in-column-group
3830 :in-table-body :in-row :in-cell))
3831 (setf (parser-phase *parser*) :in-select-in-table)
3832 (setf (parser-phase *parser*) :in-select))
3835 (def :in-body start-tag-rp-rt ()
3836 (when (element-in-scope "ruby")
3837 (generate-implied-end-tags)
3838 (when (string/= (node-name (last-open-element)) "ruby")
3839 (perror :expected-ruby-tag)))
3840 (insert-element token)
3843 (def :in-body start-tag-math (open-elements)
3844 (reconstruct-active-formatting-elements)
3845 (adjust-math-ml-attributes token)
3846 (adjust-foreign-attributes token)
3847 (setf (getf token :namespace) (find-namespace "mathml"))
3848 (insert-element token)
3849 ;;Need to get the parse error right for the case where the token
3850 ;;has a namespace not equal to the xmlns attribute
3851 (when (getf token :self-closing)
3852 (pop-end open-elements)
3853 (setf (getf token :self-closing-acknowledged) t))
3856 (def :in-body start-tag-svg (open-elements)
3857 (reconstruct-active-formatting-elements)
3858 (adjust-svg-attributes token)
3859 (adjust-foreign-attributes token)
3860 (setf (getf token :namespace) (find-namespace "svg"))
3861 (insert-element token)
3862 ;;Need to get the parse error right for the case where the token
3863 ;;has a namespace not equal to the xmlns attribute
3864 (when (getf token :self-closing)
3865 (pop-end open-elements)
3866 (setf (getf token :self-closing-acknowledged) t))
3869 (def :in-body start-tag-misplaced ()
3870 ;;; Elements that should be children of other elements that have a
3871 ;;; different insertion mode; here they are ignored
3872 ;;; "caption", "col", "colgroup", "frame", "frameset", "head",
3873 ;;; "option", "optgroup", "tbody", "td", "tfoot", "th", "thead",
3874 ;;; "tr", "noscript"
3875 (perror :unexpected-start-tag-ignored :name (getf token :name))
3878 (def :in-body start-tag-other ()
3879 (reconstruct-active-formatting-elements)
3880 (insert-element token)
3883 (def :in-body end-tag-p (open-elements)
3884 (cond ((not (element-in-scope "p" "button"))
3885 (start-tag-close-p (implied-tag-token "p" :start-tag))
3886 (perror :unexpected-end-tag :name "p")
3887 (end-tag-p (implied-tag-token "p")))
3889 (generate-implied-end-tags "p")
3890 (when (string/= (node-name (last-open-element)) "p")
3891 (perror :unexpected-end-tag :name "p"))
3892 (let ((node (pop-end open-elements)))
3893 (loop until (string= (node-name node) "p")
3894 do (setf node (pop-end open-elements))))))
3897 (def :in-body end-tag-body (open-elements)
3899 (when (not (element-in-scope "body"))
3900 (perror :unexpected-scope)
3902 (when (string/= (node-name (last-open-element)) "body")
3903 (loop for node in (cddr open-elements)
3904 do (when (member (node-name node)
3905 '("dd" "dt" "li" "optgroup" "option" "p" "rp"
3906 "rt" "tbody" "td" "tfoot" "th" "thead" "tr"
3909 ;;Not sure this is the correct name for the parse error
3910 (perror :expected-one-end-tag-but-got-another
3911 :expected-name "body" :got-name (node-name node))
3913 (setf (parser-phase *parser*) :after-body)
3916 (def :in-body end-tag-html ()
3917 ;;We repeat the test for the body end tag token being ignored here
3918 (cond ((element-in-scope "body")
3919 (end-tag-body (implied-tag-token "body"))
3923 (def :in-body end-tag-block (in-body-process-space-characters-mode open-elements)
3924 ;;Put us back in the right whitespace handling mode
3925 (when (string= (getf token :name) "pre")
3926 (setf in-body-process-space-characters-mode :non-pre))
3927 (let ((in-scope (element-in-scope (getf token :name))))
3929 (generate-implied-end-tags))
3930 (when (string/= (node-name (last-open-element))
3932 (perror :end-tag-too-early :name (getf token :name)))
3934 (let ((node (pop-end open-elements)))
3935 (loop until (string= (node-name node) (getf token :name))
3936 do (setf node (pop-end open-elements))))))
3939 (def :in-body end-tag-form (form-pointer open-elements)
3940 (let ((node form-pointer))
3941 (setf form-pointer nil)
3942 (if (or (null node) (not (element-in-scope (node-name node))))
3943 (perror :unexpected-end-tag :name "form")
3945 (generate-implied-end-tags)
3946 (when (not (equal (last-open-element) node))
3947 (perror :end-tag-too-early-ignored :name "form"))
3949 (remove node open-elements)))))
3953 ;;; - A token is a plist.
3954 ;;; - A property is an alist.
3955 ;;; - A node is an object.
3956 ;;; - An element is a node.
3958 (def :in-body end-tag-list-item (open-elements)
3959 (let ((variant (if (string= (getf token :name) "li")
3962 (if (not (element-in-scope (getf token :name) variant))
3963 (perror :unexpected-end-tag :name (getf token :name))
3965 (generate-implied-end-tags (getf token :name))
3966 (when (string/= (node-name (last-open-element))
3968 (perror :end-tag-too-early :name (getf token :name)))
3969 (let ((node (pop-end open-elements)))
3970 (loop until (string= (node-name node) (getf token :name))
3971 do (setf node (pop-end open-elements)))))))
3974 (def :in-body end-tag-heading (open-elements)
3975 (loop for item in +heading-elements+
3976 do (when (element-in-scope item)
3977 (generate-implied-end-tags)
3979 (when (string/= (node-name (last-open-element))
3981 (perror :end-tag-too-early :name (getf token :name)))
3982 (loop for item in +heading-elements+
3983 do (when (element-in-scope item)
3984 (let ((item (pop-end open-elements)))
3985 (loop until (member (node-name item) +heading-elements+
3987 do (setf item (pop-end open-elements))))))
3990 (defmacro insert-elt-at (object index place)
3991 (let ((tmp (gensym "TMP"))
3992 (object-symbol (gensym "OBJECT"))
3993 (index-symbol (gensym "INDEX")))
3994 `(let ((,object-symbol ,object)
3995 (,index-symbol ,index)
3997 (setf ,place (append (subseq ,tmp 0 (min ,index-symbol (length ,tmp)))
3998 (list ,object-symbol)
3999 (nthcdr ,index-symbol ,tmp))))))
4001 (def :in-body end-tag-formatting (active-formatting-elements open-elements)
4002 ;; The much-feared adoption agency algorithm
4003 ;; http://www.whatwg.org/specs/web-apps/current-work/#adoptionAgency
4004 ;; XXX Better parseError messages appreciated.
4006 with name = (getf token :name)
4007 with outer-loop-counter = 0
4008 with formatting-element
4013 with inner-loop-counter
4016 with common-ancestor
4018 while (< outer-loop-counter 8)
4020 (incf outer-loop-counter)
4022 ;; Step 1 paragraph 1
4023 (setf formatting-element
4024 (element-in-active-formatting-elements name))
4025 (cond ((or (not formatting-element)
4026 (and (member formatting-element
4028 (not (element-in-scope
4029 (node-name formatting-element)))))
4030 (perror :adoption-agency-1.1 :name name)
4031 (return-from outer nil))
4033 ;; Step 1 paragraph 2
4034 ((not (member formatting-element
4036 (perror :adoption-agency-1.2 :name name)
4037 (setf active-formatting-elements
4038 (remove formatting-element active-formatting-elements))
4039 (return-from outer nil)))
4041 ;; Step 1 paragraph 3
4042 (unless (eql formatting-element
4043 (last-open-element))
4044 (perror :adoption-agency-1.3 :name name))
4048 ;; Start of the adoption agency algorithm proper
4049 (setf afe-index (position formatting-element
4051 (setf furthest-block nil)
4052 (loop for element in (subseq open-elements
4054 do (when (member (node-name-tuple element)
4057 (setf furthest-block element)
4060 (when (null furthest-block)
4061 (loop for element = (pop-end open-elements)
4062 until (eql formatting-element element)
4063 finally (setf active-formatting-elements
4065 active-formatting-elements)))
4066 (return-from outer nil))
4067 (setf common-ancestor (elt open-elements (- afe-index 1)))
4070 ;;if furthestBlock.parent:
4071 ;; furthestBlock.parent.removeChild(furthestBlock)
4074 ;; The bookmark is supposed to help us
4075 ;; identify where to reinsert nodes in step
4076 ;; 12. We have to ensure that we reinsert
4077 ;; nodes after the node before the active
4078 ;; formatting element. Note the bookmark can
4080 (setf bookmark (position formatting-element
4081 active-formatting-elements))
4084 (setf node furthest-block)
4085 (setf last-node node)
4086 (setf inner-loop-counter 0)
4088 (setf index (position node open-elements))
4090 while (< inner-loop-counter 3)
4093 (incf inner-loop-counter)
4094 ;; Node is element before node in open elements
4096 (setf node (elt open-elements index))
4097 (when (not (member node active-formatting-elements))
4099 (remove node open-elements))
4100 (return-from continue))
4102 (when (eql node formatting-element)
4103 (return-from inner))
4105 (when (eql last-node furthest-block)
4106 (setf bookmark (1+ (position node
4107 active-formatting-elements))))
4109 (setf clone (node-clone* node))
4110 ;; Replace node with clone
4112 ((af active-formatting-elements)
4114 (setf (elt af (position node af)) clone)
4115 (setf (elt oe (position node oe)) clone))
4119 ;; Remove lastNode from its parents, if any
4120 (when (node-parent last-node)
4121 (node-remove-child (node-parent last-node)
4123 (node-append-child node last-node)
4126 (setf last-node node)
4127 ;; End of inner loop
4131 ;; Foster parent lastNode if commonAncestor is a
4132 ;; table, tbody, tfoot, thead, or tr we need to
4133 ;; foster parent the lastNode
4134 (when (node-parent last-node)
4135 (node-remove-child (node-parent last-node)
4138 (if (member (node-name common-ancestor)
4139 '("table" "tbody" "tfoot" "thead" "tr")
4141 (multiple-value-bind (parent insert-before)
4142 (get-table-misnested-nodeposition)
4143 (node-insert-before* parent last-node insert-before))
4144 (node-append-child* common-ancestor last-node))
4147 (setf clone (node-clone* formatting-element))
4150 (node-reparent-children furthest-block clone)
4153 (node-append-child* furthest-block clone)
4156 (setf active-formatting-elements
4157 (remove formatting-element
4158 active-formatting-elements))
4159 (insert-elt-at clone bookmark active-formatting-elements)
4163 (remove formatting-element
4165 (insert-elt-at clone
4166 (1+ (position furthest-block
4171 (def :in-body end-tag-applet-marquee-object (open-elements)
4172 (when (element-in-scope (getf token :name))
4173 (generate-implied-end-tags))
4174 (when (string/= (node-name (last-open-element))
4176 (perror :end-tag-too-early :name (getf token :name)))
4177 (when (element-in-scope (getf token :name))
4178 (let ((element (pop-end open-elements)))
4179 (loop until (string= (node-name element) (getf token :name))
4180 do (setf element (pop-end open-elements))))
4181 (clear-active-formatting-elements))
4184 (def :in-body end-tag-br (open-elements)
4185 (perror :unexpected-end-tag-treated-as
4186 :original-name "br" :new-name "br element")
4187 (reconstruct-active-formatting-elements)
4188 (insert-element (implied-tag-token "br" :start-tag))
4189 (pop-end open-elements)
4192 (def :in-body end-tag-other (open-elements)
4193 (loop for node in (reverse open-elements)
4194 do (cond ((string= (node-name node) (getf token :name))
4195 (generate-implied-end-tags (getf token :name))
4196 (when (string/= (node-name (last-open-element))
4198 (perror :unexpected-end-tag :name (getf token :name)))
4199 (loop while (not (eq node
4200 (pop-end open-elements))))
4203 (when (member (node-name-tuple node) +special-elements+
4205 (perror :unexpected-end-tag :name (getf token :name))
4212 (tagname-dispatch :text process-start-tag
4213 (default start-tag-other))
4215 (tagname-dispatch :text process-end-tag
4216 ("script" end-tag-script)
4217 (default end-tag-other))
4219 (def :text process-characters ()
4220 (parser-insert-text (getf token :data))
4223 (def :text process-eof (phase original-phase open-elements)
4224 (perror :expected-named-closing-tag-but-got-eof
4225 (node-name (last-open-element)))
4226 (pop-end open-elements)
4227 (setf phase original-phase)
4230 (def :text start-tag-other ()
4231 (error "Tried to process start tag ~S in RCDATA/RAWTEXT mode" (getf token :name)))
4233 (def :text end-tag-script (phase original-phase open-elements)
4234 (assert (string= (node-name (pop-end open-elements))
4236 (setf phase original-phase)
4237 ;; The rest of this method is all stuff that only happens if
4238 ;; document.write works
4241 (def :text end-tag-other (phase original-phase open-elements)
4242 (pop-end open-elements)
4243 (setf phase original-phase)
4248 ;; http://www.whatwg.org/specs/web-apps/current-work/#in-table
4250 (tagname-dispatch :in-table process-start-tag
4251 ("html" start-tag-html)
4252 ("caption" start-tag-caption)
4253 ("colgroup" start-tag-colgroup)
4254 ("col" start-tag-col)
4255 (("tbody" "tfoot" "thead") start-tag-row-group)
4256 (("td" "th" "tr") start-tag-imply-tbody)
4257 ("table" start-tag-table)
4258 (("style" "script") start-tag-style-script)
4259 ("input" start-tag-input)
4260 ("form" start-tag-form)
4261 (default start-tag-other))
4263 (tagname-dispatch :in-table process-end-tag
4264 ("table" end-Tag-Table)
4265 (("body" "caption" "col" "colgroup" "html" "tbody" "td"
4266 "tfoot" "th" "thead" "tr") end-Tag-Ignore)
4267 (default end-tag-other))
4269 (flet ((clear-stack-to-table-context ()
4270 ;; clear the stack back to a table context
4271 (loop until (member (node-name (last-open-element))
4275 ;;(perror :unexpected-implied-end-tag-in-table
4276 ;; :name (node-name* (last-open-element)))
4277 (pop-end (slot-value *parser* 'open-elements)))
4278 ;; When the current node is <html> it's an innerHTML case
4281 (def :in-table process-eof (inner-html)
4282 (if (string/= (node-name (last-open-element)) "html")
4283 (perror :eof-in-table)
4284 (assert inner-html))
4288 (def :in-table process-space-characters (phase original-phase)
4289 (setf original-phase phase)
4290 (setf phase :in-table-text)
4291 (process-space-characters token :phase phase)
4294 (def :in-table process-characters (phase original-phase)
4295 (setf original-phase phase)
4296 (setf phase :in-table-text)
4297 (process-characters token :phase phase)
4300 (def :in-table insert-text (insert-from-table)
4301 ;; If we get here there must be at least one non-whitespace character
4302 ;; Do the table magic!
4303 (setf insert-from-table t)
4304 (process-characters token :phase :in-body)
4305 (setf insert-from-table nil)
4308 (def :in-table start-tag-caption (phase active-formatting-elements)
4309 (clear-stack-to-table-context)
4310 (push-end :marker active-formatting-elements)
4311 (insert-element token)
4312 (setf phase :in-caption)
4315 (def :in-table start-tag-colgroup (phase)
4316 (clear-stack-to-table-context)
4317 (insert-element token)
4318 (setf phase :in-column-group)
4321 (def :in-table start-tag-col ()
4322 (start-tag-colgroup (implied-tag-token "colgroup" :start-tag))
4325 (def :in-table start-tag-row-group (phase)
4326 (clear-stack-to-table-context)
4327 (insert-element token)
4328 (setf phase :in-table-body)
4331 (def :in-table start-tag-imply-tbody ()
4332 (start-tag-row-group (implied-tag-token "tbody" :start-tag))
4335 (def :in-table start-tag-table (phase inner-html)
4336 (perror :unexpected-start-tag-implies-end-tag
4339 (process-end-tag (implied-tag-token "table") :phase phase)
4343 (def :in-table start-tag-style-script ()
4344 (process-start-tag token :phase :in-head))
4346 (def :in-table start-tag-input (open-elements)
4347 (let ((type (assoc "type" (getf token :data) :test #'string=)))
4349 (string= (ascii-upper-2-lower (cdr type)) "hidden"))
4350 (perror :unexpected-hidden-input-in-table)
4351 (insert-element token)
4352 ;; XXX associate with form
4353 (pop-end open-elements))
4355 (start-tag-other token))))
4358 (def :in-table start-tag-form (form-pointer open-elements)
4359 (perror :unexpected-form-in-table)
4360 (unless form-pointer
4361 (insert-element token)
4362 (setf form-pointer (last-open-element))
4363 (pop-end open-elements))
4366 (def :in-table start-tag-other (insert-from-table)
4367 (perror :unexpected-start-tag-implies-table-voodoo :name (getf token :name))
4368 ;; Do the table magic!
4369 (setf insert-from-table t)
4370 (process-start-tag token :phase :in-body)
4371 (setf insert-from-table nil)
4374 (def :in-table end-tag-table (inner-html open-elements)
4375 (cond ((element-in-scope "table" "table")
4376 (generate-implied-end-tags)
4377 (unless (equal (node-name (last-open-element)) "table")
4378 (perror :end-tag-too-early-named
4380 :expected-name (node-name (last-open-element))))
4381 (loop until (equal (node-name (last-open-element)) "table")
4382 do (pop-end open-elements))
4383 (pop-end open-elements)
4384 (reset-insertion-mode))
4388 (perror :end-tag-table-in-table-inner-html-case)))
4391 (def :in-table end-tag-ignore ()
4392 (perror :unexpected-end-tag :name (getf token :name))
4395 (def :in-table end-tag-other (insert-from-table)
4396 (perror :unexpected-end-tag-implies-table-voodoo :name (getf token :name))
4397 ;; Do the table magic!
4398 (setf insert-from-table t)
4399 (process-end-tag token :phase :in-body)
4400 (setf insert-from-table nil)
4406 (defun flush-characters ()
4407 (with-slots (character-tokens) *parser*
4408 (let ((data (apply #'concatenate 'string
4409 (loop for item in (reverse character-tokens)
4410 collect (getf item :data)))))
4411 (if (not (only-space-characters-p data))
4412 (insert-text (list :type :characters
4415 (parser-insert-text data)))
4416 (setf character-tokens nil)))
4418 (def :in-table-text process-comment (phase original-phase)
4420 (setf phase original-phase)
4423 (def :in-table-text process-eof (phase original-phase)
4425 (setf phase original-phase)
4428 (def :in-table-text process-characters (character-tokens)
4429 (unless (equal (getf token :data) (string #\u0000))
4430 (push token character-tokens))
4433 (def :in-table-text process-space-characters (character-tokens)
4434 ;; pretty sure we should never reach here
4435 (push token character-tokens)
4438 (def :in-table-text process-start-tag (phase original-phase)
4440 (setf phase original-phase)
4443 (def :in-table-text process-end-tag (phase original-phase)
4445 (setf phase original-phase)
4450 ;; http://www.whatwg.org/specs/web-apps/current-work/#in-caption
4452 (tagname-dispatch :in-caption process-start-tag
4453 ("html" start-tag-html)
4454 (("caption" "col" "colgroup" "tbody" "td" "tfoot" "th"
4455 "thead" "tr") start-tag-table-element)
4456 (default start-tag-other))
4458 (tagname-dispatch :in-caption process-end-tag
4459 ("caption" end-tag-caption)
4460 ("table" end-tag-table)
4461 (("body" "col" "colgroup" "html" "tbody" "td" "tfoot" "th"
4462 "thead" "tr") end-tag-ignore)
4463 (default end-tag-other))
4465 (flet ((ignore-end-tag-caption ()
4466 (not (element-in-scope "caption" "table"))))
4468 (def :in-caption process-eof ()
4469 (process-eof token :phase :in-body))
4471 (def :in-caption process-characters ()
4472 (process-characters token :phase :in-body))
4474 (def :in-caption start-tag-table-element (phase)
4475 (perror :start-tag-table-element-in-caption)
4476 ;; XXX Have to duplicate logic here to find out if the tag is ignored
4477 (prog1 (unless (ignore-end-tag-caption)
4479 (process-end-tag (implied-tag-token "caption") :phase phase)))
4481 (def :in-caption start-tag-other ()
4482 (process-start-tag token :phase :in-body))
4484 (def :in-caption end-tag-caption (phase inner-html open-elements)
4485 (cond ((not (ignore-end-tag-caption))
4486 ;; AT this code is quite similar to endTagTable in "InTable"
4487 (generate-implied-end-tags)
4488 (unless (equal (node-name (last-open-element)) "caption")
4489 (perror :expected-one-end-tag-but-got-another
4491 :expected-name (node-name (last-open-element))))
4492 (loop until (equal (node-name (last-open-element)) "caption")
4493 do (pop-end open-elements))
4494 (clear-active-formatting-elements)
4495 (setf phase :in-table))
4499 (perror :end-tag-caption-in-caption-inner-html-mode)))
4502 (def :in-caption end-tag-table (phase)
4503 (perror :end-tag-table-in-caption)
4504 (prog1 (unless (ignore-end-tag-caption)
4506 (process-end-tag (implied-tag-token "caption") :phase phase)))
4508 (def :in-caption end-tag-ignore ()
4509 (perror :unexpected-end-tag :name (getf token :name))
4512 (def :in-caption end-tag-other ()
4513 (process-end-tag token :phase :in-body)))
4516 ;; InColumnGroupPhase
4517 ;; http://www.whatwg.org/specs/web-apps/current-work/#in-column
4519 (tagname-dispatch :in-column-group process-start-tag
4520 ("html" start-tag-html)
4521 ("col" start-tag-col)
4522 (default start-tag-other))
4524 (tagname-dispatch :in-column-group process-end-tag
4525 ("colgroup" end-tag-colgroup)
4527 (default end-tag-other))
4530 (flet ((ignore-end-tag-colgroup ()
4531 (string= (node-name (last-open-element)) "html")))
4533 (def :in-column-group process-eof (inner-html)
4534 (cond ((string= (node-name (last-open-element)) "html")
4538 (let ((ignore-end-tag (ignore-end-tag-colgroup)))
4539 (end-tag-colgroup (implied-tag-token "colgroup"))
4540 (not ignore-end-tag)))))
4542 (def :in-column-group process-characters ()
4543 (prog1 (unless (ignore-end-tag-colgroup)
4545 (end-tag-colgroup (implied-tag-token "colgroup"))))
4547 (def :in-column-group start-tag-col (open-elements)
4548 (insert-element token)
4549 (pop-end open-elements)
4552 (def :in-column-group start-tag-other ()
4553 (prog1 (unless (ignore-end-tag-colgroup)
4555 (end-tag-colgroup (implied-tag-token "colgroup"))))
4557 (def :in-column-group end-tag-colgroup (phase open-elements)
4558 (cond ((ignore-end-tag-colgroup)
4560 (perror :end-tag-colgroup-in-column-group-inner-html-mode))
4562 (pop-end open-elements)
4563 (setf phase :in-table)))
4566 (def :in-column-group end-tag-col ()
4567 (perror :no-end-tag :name "col")
4570 (def :in-column-group end-tag-other ()
4571 (prog1 (unless (ignore-end-tag-colgroup)
4573 (end-tag-colgroup (implied-tag-token "colgroup")))))
4577 ;; http://www.whatwg.org/specs/web-apps/current-work/#in-table0
4579 (tagname-dispatch :in-table-body process-start-tag
4580 ("html" start-tag-html)
4582 (("td" "th") start-tag-table-cell)
4583 (("caption" "col" "colgroup" "tbody" "tfoot" "thead") start-tag-table-other)
4584 (default start-tag-other))
4586 (tagname-dispatch :in-table-body process-end-tag
4587 (("tbody" "tfoot" "thead") end-Tag-Table-Row-Group)
4588 ("table" end-Tag-Table)
4589 (("body" "caption" "col" "colgroup" "html" "td" "th" "tr") end-Tag-Ignore)
4590 (default end-tag-other))
4592 (flet ((clear-stack-to-table-body-context ()
4593 (loop until (member (node-name (last-open-element))
4594 '("tbody" "tfoot" "thead" "html")
4597 ;;(perror :unexpected-implied-end-tag-in-table
4598 ;; :name (node-name (last-open-element)))
4599 (pop-end (slot-value *parser* 'open-elements)))
4600 (when (string= (node-name (last-open-element)) "html")
4601 (assert (slot-value *parser* 'inner-html)))))
4603 (def :in-table-body process-eof ()
4604 (process-eof token :phase :in-table))
4606 (def :in-table-body process-space-characters ()
4607 (process-space-characters token :phase :in-table))
4609 (def :in-table-body process-characters ()
4610 (process-characters token :phase :in-table))
4612 (def :in-table-body start-tag-tr (phase)
4613 (clear-stack-to-table-body-context)
4614 (insert-element token)
4615 (setf phase :in-row)
4618 (def :in-table-body start-tag-table-cell ()
4619 (perror :unexpected-cell-in-table-body :name (getf token :name))
4620 (start-tag-tr (implied-tag-token "tr" :start-tag))
4623 (def :in-table-body start-tag-table-other (inner-html)
4624 ;; XXX AT Any ideas on how to share this with endTagTable?
4625 (cond ((or (element-in-scope "tbody" "table")
4626 (element-in-scope "thead" "table")
4627 (element-in-scope "tfoot" "table"))
4628 (clear-stack-to-table-body-context)
4629 (end-tag-table-row-group
4630 (implied-tag-token (node-name (last-open-element))))
4635 (perror :start-tag-table-other-in-table-body-inner-html-mode)
4638 (def :in-table-body start-tag-other ()
4639 (process-start-tag token :phase :in-table))
4641 (def :in-table-body end-tag-table-row-group (phase open-elements)
4642 (cond ((element-in-scope (getf token :name) "table")
4643 (clear-stack-to-table-body-context)
4644 (pop-end open-elements)
4645 (setf phase :in-table))
4647 (perror :unexpected-end-tag-in-table-body :name (getf token :name))))
4650 (def :in-table-body end-tag-table (inner-html)
4651 (cond ((or (element-in-scope "tbody" "table")
4652 (element-in-scope "thead" "table")
4653 (element-in-scope "tfoot" "table"))
4654 (clear-stack-to-table-body-context)
4655 (end-tag-table-row-group
4656 (implied-tag-token (node-name (last-open-element))))
4661 (perror :end-tag-table-other-in-table-body-inner-html-mode)
4664 (def :in-table-body end-tag-ignore ()
4665 (perror :unexpected-end-tag-in-table-body :name (getf token :name))
4668 (def :in-table-body end-tag-other ()
4669 (process-end-tag token :phase :in-table)))
4672 ;; http://www.whatwg.org/specs/web-apps/current-work/#in-row
4674 (tagname-dispatch :in-row process-start-tag
4675 ("html" start-tag-html)
4676 (("td" "th") start-tag-table-cell)
4677 (("caption" "col" "colgroup" "tbody" "tfoot" "thead" "tr")
4678 start-tag-table-other)
4679 (default start-tag-other))
4681 (tagname-dispatch :in-row process-end-tag
4683 ("table" end-tag-table)
4684 (("tbody" "tfoot" "thead") end-tag-table-row-group)
4685 (("body" "caption" "col" "colgroup" "html" "td" "th") end-tag-ignore)
4686 (default end-tag-other))
4689 ;; helper methods (XXX unify this with other table helper methods)
4690 (flet ((clear-stack-to-table-row-context ()
4691 (loop until (member (node-name (last-open-element))
4695 (perror :unexpected-implied-end-tag-in-table-row
4696 :name (node-name (last-open-element)))
4697 (pop-end (slot-value *parser* 'open-elements))))
4699 (ignore-end-tag-tr ()
4700 (not (element-in-scope "tr" "table"))))
4703 (def :in-row process-eof ()
4704 (process-eof token :phase :in-table)
4707 (def :in-row process-space-characters ()
4708 (process-space-characters token :phase :in-table))
4710 (def :in-row process-characters ()
4711 (process-characters token :phase :in-table))
4713 (def :in-row start-tag-table-cell (phase active-formatting-elements)
4714 (clear-stack-to-table-row-context)
4715 (insert-element token)
4716 (setf phase :in-cell)
4717 (push-end :marker active-formatting-elements)
4720 (def :in-row start-tag-table-other ()
4721 (let ((ignore-end-tag (ignore-end-tag-tr)))
4722 (end-tag-tr (implied-tag-token "tr"))
4723 ;; XXX how are we sure it's always ignored in the innerHTML case?
4724 (unless ignore-end-tag
4727 (def :in-row start-tag-other ()
4728 (process-start-tag token :phase :in-table))
4730 (def :in-row end-tag-tr (phase inner-html open-elements)
4731 (cond ((not (ignore-end-tag-tr))
4732 (clear-stack-to-table-row-context)
4733 (pop-end open-elements)
4734 (setf phase :in-table-body))
4738 (perror :end-tag-tr-inner-html-mode)))
4741 (def :in-row end-tag-table ()
4742 (let ((ignore-end-tag (ignore-end-tag-tr)))
4743 (end-tag-tr (implied-tag-token "tr"))
4744 ;; Reprocess the current tag if the tr end tag was not ignored
4745 ;; XXX how are we sure it's always ignored in the innerHTML case?
4746 (unless ignore-end-tag
4749 (def :in-row end-tag-table-row-group ()
4750 (cond ((element-in-scope (getf token :name) "table")
4751 (end-tag-tr (implied-tag-token "tr"))
4754 (perror :end-tag-table-row-group-something-wrong)
4757 (def :in-row end-tag-ignore ()
4758 (perror :unexpected-end-tag-in-table-row (getf token :name))
4761 (def :in-row end-tag-other ()
4762 (process-end-tag token :phase :in-table)))
4766 ;; http://www.whatwg.org/specs/web-apps/current-work/#in-cell
4768 (tagname-dispatch :in-cell process-start-tag
4769 ("html" start-tag-html)
4770 (("caption" "col" "colgroup" "tbody" "td" "tfoot" "th" "thead" "tr")
4771 start-tag-table-other)
4772 (default start-tag-other))
4774 (tagname-dispatch :in-cell process-end-tag
4775 (("td" "th") end-tag-table-cell)
4776 (("body" "caption" "col" "colgroup" "html") end-tag-ignore)
4777 (("table" "tbody" "tfoot" "thead" "tr") end-tag-imply)
4778 (default end-tag-other))
4780 (flet ((close-cell ()
4781 (if (element-in-scope "td" "table")
4782 (end-tag-table-cell (implied-tag-token "td"))
4783 (if (element-in-scope "th" "table")
4784 (end-tag-table-cell (implied-tag-token "th"))))))
4786 (def :in-cell process-eof ()
4787 (process-eof token :phase :in-body)
4790 (def :in-cell process-characters ()
4791 (process-characters token :phase :in-body))
4793 (def :in-cell start-tag-table-other (inner-html)
4794 (cond ((or (element-in-scope "td" "table")
4795 (element-in-scope "th" "table"))
4801 (perror :start-tag-table-other-in-inner-html-mode)
4804 (def :in-cell start-tag-other ()
4805 (process-start-tag token :phase :in-body))
4807 (def :in-cell end-tag-table-cell (phase open-elements)
4808 (cond ((element-in-scope (getf token :name) "table")
4809 (generate-implied-end-tags (getf token :name))
4810 (cond ((not (equal (node-name (last-open-element))
4811 (getf token :name)))
4812 (perror :unexpected-cell-end-tag :name (getf token :name))
4813 (loop until (equal (node-name (pop-end open-elements))
4814 (getf token :name))))
4816 (pop-end open-elements)))
4817 (clear-active-formatting-elements)
4818 (setf phase :in-row))
4820 (perror :unexpected-end-tag :name (getf token :name))))
4823 (def :in-cell end-tag-ignore ()
4824 (perror :unexpected-end-tag :name (getf token :name))
4827 (def :in-cell end-tag-imply ()
4828 (cond ((element-in-scope (getf token :name) "table")
4832 ;; sometimes innerHTML case
4833 (perror :end-tag-imply-sometimes-inner-html-case)
4836 (def :in-cell end-tag-other ()
4837 (process-end-tag token :phase :in-body)))
4842 (tagname-dispatch :in-select process-start-tag
4843 ("html" start-tag-html)
4844 ("option" start-tag-option)
4845 ("optgroup" start-tag-optgroup)
4846 ("select" start-tag-select)
4847 (("input" "keygen" "textarea") start-tag-input)
4848 ("script" start-tag-script)
4849 (default start-tag-other))
4851 (tagname-dispatch :in-select process-end-tag
4852 ("option" end-tag-option)
4853 ("optgroup" end-tag-optgroup)
4854 ("select" end-tag-select)
4855 (default end-tag-other))
4857 ;; http://www.whatwg.org/specs/web-apps/current-work/#in-select
4858 (def :in-select process-eof (inner-html)
4859 (if (not (equal (node-name (last-open-element)) "html"))
4860 (perror :eof-in-select)
4861 (assert inner-html))
4864 (def :in-select process-characters ()
4865 (unless (equal (getf token :data) (string #\u0000))
4866 (parser-insert-text (getf token :data)))
4869 (def :in-select start-tag-option (open-elements)
4870 ;; We need to imply </option> if <option> is the current node.
4871 (when (equal (node-name (last-open-element)) "option")
4872 (pop-end open-elements))
4873 (insert-element token)
4876 (def :in-select start-tag-optgroup (open-elements)
4877 (when (equal (node-name (last-open-element)) "option")
4878 (pop-end open-elements))
4879 (when (equal (node-name (last-open-element)) "optgroup")
4880 (pop-end open-elements))
4881 (insert-element token)
4884 (def :in-select start-tag-select ()
4885 (perror :unexpected-select-in-select)
4886 (end-tag-select (implied-tag-token "select"))
4889 (def :in-select start-tag-input (inner-html)
4890 (perror :unexpected-input-in-select)
4891 (cond ((element-in-scope "select" "select")
4892 (end-tag-select (implied-tag-token "select"))
4898 (def :in-select start-tag-script ()
4899 (process-start-tag token :phase :in-head))
4901 (def :in-select start-tag-other ()
4902 (perror :unexpected-start-tag-in-select :name (getf token :name))
4905 (def :in-select end-tag-option (open-elements)
4906 (if (equal (node-name (last-open-element)) "option")
4907 (pop-end open-elements)
4908 (perror :unexpected-end-tag-in-select :name (getf token :name)))
4911 (def :in-select end-tag-optgroup (open-elements)
4912 ;; </optgroup> implicitly closes <option>
4913 (when (and (equal (node-name (last-open-element)) "option")
4914 (equal (node-name (elt open-elements
4915 (- (length open-elements) 2)))
4917 (pop-end open-elements))
4918 ;; It also closes </optgroup>
4919 (if (equal (node-name (last-open-element)) "optgroup")
4920 (pop-end open-elements)
4922 (perror :unexpected-end-tag-in-select :name (getf token :name)))
4925 (def :in-select end-tag-select (inner-html open-elements)
4926 (cond ((element-in-scope "select" "select")
4927 (loop until (equal (node-name (pop-end open-elements))
4929 (reset-insertion-mode))
4933 (perror :end-tag-select-in-inner-html-mode)))
4936 (def :in-select end-tag-other ()
4937 (perror :unexpected-end-tag-in-select :name (getf token :name))
4941 ;; InSelectInTablePhase
4943 (tagname-dispatch :in-select-in-table process-start-tag
4944 (("caption" "table" "tbody" "tfoot" "thead" "tr" "td" "th") start-tag-table)
4945 (default start-tag-other))
4947 (tagname-dispatch :in-select-in-table process-end-tag
4948 (("caption" "table" "tbody" "tfoot" "thead" "tr" "td" "th") end-tag-table)
4949 (default end-tag-other))
4951 (def :in-select-in-table process-eof ()
4952 (process-eof token :phase :in-select)
4955 (def :in-select-in-table process-characters ()
4956 (process-characters token :phase :in-select))
4958 (def :in-select-in-table start-tag-table ()
4959 (perror :unexpected-table-element-start-tag-in-select-in-table :name (getf token :name))
4960 (end-tag-other (implied-tag-token "select"))
4963 (def :in-select-in-table start-tag-other ()
4964 (process-start-tag token :phase :in-select))
4966 (def :in-select-in-table end-tag-table ()
4967 (perror :unexpected-table-element-end-tag-in-select-in-table :name (getf token :name))
4968 (cond ((element-in-scope (getf token :name) "table")
4969 (end-tag-other (implied-tag-token "select"))
4974 (def :in-select-in-table end-tag-other ()
4975 (process-end-tag token :phase :in-select))
4978 ;; InForeignContentPhase
4980 (defparameter +breakout-elements+
4981 '("b" "big" "blockquote" "body" "br"
4982 "center" "code" "dd" "div" "dl" "dt"
4983 "em" "embed" "h1" "h2" "h3"
4984 "h4" "h5" "h6" "head" "hr" "i" "img"
4985 "li" "listing" "menu" "meta" "nobr"
4986 "ol" "p" "pre" "ruby" "s" "small"
4987 "span" "strong" "strike" "sub" "sup"
4988 "table" "tt" "u" "ul" "var"))
4991 (defun adjust-svg-tag-names (token)
4992 (let ((replacement (cdr
4993 (assoc (getf token :name)
4994 '(("altglyph" . "altGlyph")
4995 ("altglyphdef" . "altGlyphDef")
4996 ("altglyphitem" . "altGlyphItem")
4997 ("animatecolor" . "animateColor")
4998 ("animatemotion" . "animateMotion")
4999 ("animatetransform" . "animateTransform")
5000 ("clippath" . "clipPath")
5001 ("feblend" . "feBlend")
5002 ("fecolormatrix" . "feColorMatrix")
5003 ("fecomponenttransfer" . "feComponentTransfer")
5004 ("fecomposite" . "feComposite")
5005 ("feconvolvematrix" . "feConvolveMatrix")
5006 ("fediffuselighting" . "feDiffuseLighting")
5007 ("fedisplacementmap" . "feDisplacementMap")
5008 ("fedistantlight" . "feDistantLight")
5009 ("feflood" . "feFlood")
5010 ("fefunca" . "feFuncA")
5011 ("fefuncb" . "feFuncB")
5012 ("fefuncg" . "feFuncG")
5013 ("fefuncr" . "feFuncR")
5014 ("fegaussianblur" . "feGaussianBlur")
5015 ("feimage" . "feImage")
5016 ("femerge" . "feMerge")
5017 ("femergenode" . "feMergeNode")
5018 ("femorphology" . "feMorphology")
5019 ("feoffset" . "feOffset")
5020 ("fepointlight" . "fePointLight")
5021 ("fespecularlighting" . "feSpecularLighting")
5022 ("fespotlight" . "feSpotLight")
5023 ("fetile" . "feTile")
5024 ("feturbulence" . "feTurbulence")
5025 ("foreignobject" . "foreignObject")
5026 ("glyphref" . "glyphRef")
5027 ("lineargradient" . "linearGradient")
5028 ("radialgradient" . "radialGradient")
5029 ("textpath" . "textPath"))
5032 (setf (getf token :name) replacement))))
5035 (defparameter +only-space-characters-regexp+
5036 (cl-ppcre:create-scanner `(:sequence :start-anchor
5039 (:alternation ,@(coerce +space-characters+ 'list)))
5041 :multi-line-mode t))
5043 (defun only-space-characters-p (string)
5044 (cl-ppcre:scan +only-space-characters-regexp+ string))
5046 (def :in-foreign-content process-characters (frameset-ok)
5047 (cond ((equal (getf token :data) (string #\u0000))
5048 (setf (getf token :data) (string #\uFFFD)))
5050 (not (only-space-characters-p (getf token :data))))
5051 (setf frameset-ok nil)))
5052 (process-characters token :phase nil)
5055 (def :in-foreign-content process-start-tag (html-namespace open-elements)
5057 (let ((current-node (last-open-element)))
5058 (cond ((or (member (getf token :name) +breakout-elements+ :test #'string=)
5059 (and (string= (getf token :name) "font")
5060 (intersection (mapcar #'car (getf token :data))
5061 '("color" "face" "size")
5063 (parser-parse-error :unexpected-html-element-in-foreign-content
5065 (loop until (or (is-html-integration-point (last-open-element))
5066 (is-math-ml-text-integration-point (last-open-element))
5067 (equal (node-namespace (last-open-element))
5069 do (pop-end open-elements))
5072 (cond ((equal (node-namespace current-node) (find-namespace "mathml"))
5073 (adjust-math-ml-attributes token))
5074 ((equal (node-namespace current-node) (find-namespace "svg"))
5075 (adjust-svg-tag-names token)
5076 (adjust-svg-attributes token)))
5077 (adjust-foreign-attributes token)
5078 (setf (getf token :namespace) (node-namespace current-node))
5079 (insert-element token)
5080 (when (getf token :self-closing)
5081 (pop-end open-elements)
5082 (setf (getf token :self-closing-acknowledged) t)))))
5085 (def :in-foreign-content process-end-tag (phase original-phase html-namespace open-elements)
5087 (node-index (1- (length open-elements)))
5088 (node (last-open-element)))
5089 (unless (string= (node-name node) (getf token :name))
5090 (parser-parse-error :unexpected-end-tag (getf token :name)))
5093 (when (string= (ascii-upper-2-lower (node-name node)) (getf token :name))
5094 ;; XXX this isn't in the spec but it seems necessary
5095 (when (eql phase :in-table-text)
5097 (setf phase original-phase))
5098 (loop until (eql (pop-end open-elements) node)
5099 do (assert open-elements))
5100 (setf new-token nil)
5104 (setf node (elt open-elements node-index))
5105 (when (equal (node-namespace node)
5107 (setf new-token (process-end-tag token :phase phase))
5113 (tagname-dispatch :after-body process-start-tag
5114 ("html" start-tag-html)
5115 (default start-tag-other))
5117 (tagname-dispatch :after-body process-end-tag
5118 ("html" end-tag-html)
5119 (default end-tag-other))
5121 (def :after-body process-eof ()
5125 (def :after-body process-comment (open-elements)
5126 ;; This is needed because data is to be appended to the <html> element
5127 ;; here and not to whatever is currently open.
5128 (insert-comment token (first open-elements))
5131 (def :after-body process-characters (phase)
5132 (parser-parse-error :unexpected-char-after-body)
5133 (setf phase :in-body)
5136 (def :after-body start-tag-html ()
5137 (process-start-tag token :phase :in-body))
5139 (def :after-body start-tag-other (phase)
5140 (parser-parse-error :unexpected-start-tag-after-body
5141 `(:name ,(getf token :name)))
5142 (setf phase :in-body)
5145 (def :after-body end-tag-html (inner-html phase)
5147 (parser-parse-error :unexpected-end-tag-after-body-innerhtml)
5148 (setf phase :after-after-body))
5151 (def :after-body end-tag-other (phase)
5152 (parser-parse-error :unexpected-end-tag-after-body
5153 `(:name ,(getf token :name)))
5154 (setf phase :in-body)
5159 (tagname-dispatch :in-frameset process-start-tag
5160 ("html" start-tag-html)
5161 ("frameset" start-tag-frameset)
5162 ("frame" start-tag-frame)
5163 ("noframes"start-tag-noframes)
5164 (default start-tag-other))
5166 (tagname-dispatch :in-frameset process-end-tag
5167 ("frameset" end-tag-frameset)
5168 (default end-tag-other))
5171 (def :in-frameset process-eof (inner-html)
5172 (if (string/= (node-name (last-open-element)) "html")
5173 (parser-parse-error :eof-in-frameset)
5174 (assert inner-html))
5177 (def :in-frameset process-characters ()
5178 (parser-parse-error :unexpected-char-in-frameset)
5181 (def :in-frameset start-tag-frameset ()
5182 (insert-element token)
5185 (def :in-frameset start-tag-frame (open-elements)
5186 (insert-element token)
5187 (pop-end open-elements)
5190 (def :in-frameset start-tag-noframes ()
5191 (process-start-tag token :phase :in-body))
5193 (def :in-frameset start-tag-other ()
5194 (parser-parse-error :unexpected-start-tag-in-frameset
5195 `(:name ,(getf token :name)))
5198 (def :in-frameset end-tag-frameset (phase inner-html open-elements)
5199 (if (string= (node-name (last-open-element)) "html")
5201 (parser-parse-error :unexpected-frameset-in-frameset-innerhtml)
5202 (pop-end open-elements))
5204 (when (and (not inner-html)
5205 (string/= (node-name (last-open-element)) "frameset"))
5206 ;; If we're not in innerHTML mode and the the current node is not a
5207 ;; "frameset" element (anymore) then switch.
5208 (setf phase :after-frameset))
5211 (def :in-frameset end-tag-other ()
5212 (parser-parse-error :unexpected-end-tag-in-frameset
5213 `(:name ,(getf token :name)))
5217 ;; AfterFramesetPhase
5219 (tagname-dispatch :after-frameset process-start-tag
5220 ("html" start-tag-html)
5221 ("noframes" start-tag-noframes)
5222 (default start-tag-other))
5224 (tagname-dispatch :after-frameset process-end-tag
5225 ("html" end-tag-html)
5226 (default end-tag-other))
5228 (def :after-frameset process-eof ()
5232 (def :after-frameset process-characters ()
5233 (parser-parse-error :unexpected-char-after-frameset)
5236 (def :after-frameset start-tag-noframes ()
5237 (process-start-tag token :phase :in-head))
5239 (def :after-frameset start-tag-other ()
5240 (parser-parse-error :unexpected-start-tag-after-frameset
5241 `(:name ,(getf token :name)))
5244 (def :after-frameset end-tag-html (phase)
5245 (setf phase :after-after-frameset)
5248 (def :after-frameset end-tag-other ()
5249 (parser-parse-error :unexpected-end-tag-after-frameset
5250 `(:name ,(getf token :name)))
5253 ;; AfterAfterBodyPhase
5255 (tagname-dispatch :after-after-body process-start-tag
5256 ("html" start-tag-html)
5257 (default start-tag-other))
5259 (def :after-after-body process-eof ()
5262 (def :after-after-body process-comment ()
5263 (insert-comment token (document*))
5266 (def :after-after-body process-space-characters ()
5267 (process-space-characters token :phase :in-body))
5269 (def :after-after-body process-characters (phase)
5270 (parser-parse-error :expected-eof-but-got-char)
5271 (setf phase :in-body)
5274 (def :after-after-body start-tag-html ()
5275 (process-start-tag token :phase :in-body))
5277 (def :after-after-body start-tag-other (phase)
5278 (parser-parse-error :expected-eof-but-got-start-tag
5279 `(:name (getf token :name)))
5280 (setf phase :in-body)
5283 (def :after-after-body process-end-tag (phase)
5284 (parser-parse-error :expected-eof-but-got-end-tag
5285 `(:name (getf token :name)))
5286 (setf phase :in-body)
5289 ;; AfterAfterFramesetPhase
5291 (tagname-dispatch :after-after-frameset process-start-tag
5292 ("html" start-tag-html)
5293 ("noframes" start-tag-noframes)
5294 (default start-tag-other))
5296 (def :after-after-frameset process-eof ()
5299 (def :after-after-frameset process-comment ()
5300 (insert-comment token (document*))
5303 (def :after-after-frameset process-space-characters ()
5304 (process-space-characters token :phase :in-body))
5306 (def :after-after-frameset process-characters ()
5307 (parser-parse-error :expected-eof-but-got-char)
5310 (def :after-after-frameset start-tag-html ()
5311 (process-start-tag token :phase :in-body))
5313 (def :after-after-frameset start-tag-noframes ()
5314 (process-start-tag token :phase :in-head))
5316 (def :after-after-frameset start-tag-other ()
5317 (parser-parse-error :expected-eof-but-got-start-tag
5318 `(:name (getf token :name)))
5321 (def :after-after-frameset process-end-tag ()
5322 (parser-parse-error :expected-eof-but-got-end-tag
5323 `(:name (getf token :name)))
5327 (defun xml-escape-name (name)
5328 "Escapes a node name (element, attribute, doctype) by replacing any
5329 character not valid in XML name by Uxxxxxx, where x is the code point
5330 as six hex digits. This encoding is reversable, since the HTML parser
5331 down cases all characters in names.
5333 See: https://www.w3.org/TR/html5/syntax.html#coercing-an-html-dom-into-an-infoset"
5334 (if (and (xml-name-start-char-p (char name 0))
5335 (every #'xml-name-char-p name))
5337 (with-output-to-string (out)
5338 (loop for first = t then nil
5339 for c across name do
5341 (xml-name-start-char-p c)
5342 (xml-name-char-p c))
5344 (format out "U~:@(~6,'0X~)" (char-code c)))))))
5347 (defun xml-unescape-name (name)
5348 "Reverert escaping done by xml-unescape-name."
5349 (cl-ppcre:regex-replace-all
5353 (string (code-char (parse-integer u :start 1 :radix 16))))
5357 (defun xml-name-start-char-p (c)
5358 (or (char<= #\a c #\z)
5361 (char<= (code-char #xC0) c (code-char #xD6))
5362 (char<= (code-char #xD8) c (code-char #xF6))
5363 (char<= (code-char #xF8) c (code-char #x2FF))
5364 (char<= (code-char #x370) c (code-char #x37D))
5365 (char<= (code-char #x37F) c (code-char #x1FFF))
5366 (char<= (code-char #x200C) c (code-char #x200D))
5367 (char<= (code-char #x2070) c (code-char #x218F))
5368 (char<= (code-char #x2C00) c (code-char #x2FEF))
5369 (char<= (code-char #x3001) c (code-char #xD7FF))
5370 (char<= (code-char #xF900) c (code-char #xFDCF))
5371 (char<= (code-char #xFDF0) c (code-char #xFFFD))
5372 (char<= (code-char #x10000) c (code-char #xEFFFF))))
5375 (defun xml-name-char-p (c)
5376 (or (xml-name-start-char-p c)
5380 (char= (code-char #xB7) c)
5381 (char<= (code-char #x0300) c (code-char #x036F))
5382 (char<= (code-char #x203F) c (code-char #x2040))))
5386 (defmethod transform-html5-dom ((to-type (eql :xmls)) node
5387 &key namespace comments)
5388 "Convert a node into an XMLS-compatible tree of conses, starting
5389 at. If the node is a document-fragement a list of XMLS trees is returned."
5390 (labels ((node-to-xmls (node parent-ns xlink-defined)
5391 (ecase (node-type node)
5394 (element-map-children (lambda (n)
5395 (when (string= (node-name n) "html")
5399 (node-to-xmls root parent-ns xlink-defined)))
5402 (element-map-children (lambda (node)
5403 (push (node-to-xmls node parent-ns xlink-defined)
5406 (nreverse xmls-nodes)))
5408 (let (attrs children)
5409 (element-map-attributes (lambda (name node-namespace value)
5410 (when (and namespace
5412 (equal node-namespace (find-namespace "xlink")))
5413 (push '#.(list "xmlns:xlink" (find-namespace "xlink")) attrs)
5414 (setf xlink-defined t))
5415 (push (list (if node-namespace
5417 (xml-escape-name name))
5421 (element-map-children (lambda (c)
5427 (not (equal parent-ns (node-namespace node))))
5428 (cons (node-name node) (node-namespace node))
5429 (xml-escape-name (node-name node)))
5432 (node-to-xmls c (node-namespace node) xlink-defined))
5433 (nreverse children)))))
5438 (list :comment nil (node-value node)))))))
5439 (node-to-xmls node nil nil)))
5442 (defmethod transform-html5-dom ((to-type (eql :xmls-ns)) node &key)
5443 (transform-html5-dom :xmls node :namespace t))