changelog shortlog graph tags branches changeset files revisions annotate raw help

Mercurial > core / lisp/lib/dat/html/html.lisp

changeset 667: bb8aa1eda12b
parent: a3b65a8138ac
author: Richard Westhaver <ellis@rwest.io>
date: Mon, 23 Sep 2024 17:03:54 -0400
permissions: -rw-r--r--
description: graph, css vars, corfu-terminal fix
1 ;;; dat/html.lisp --- HTML parser
2 
3 ;; see https://github.com/rotatef/cl-html5-parser
4 
5 ;; spec: https://html.spec.whatwg.org/
6 
7 ;;; Commentary:
8 
9 ;; HTML is usually associated with XML, but not all HTML is valid
10 ;; XML.
11 
12 ;; This package provides a pretty good HTML parser, with the default
13 ;; DOM being the one used in our XML package DAT/XML.
14 
15 ;;; Code:
16 
17 ;;; inputstream
18 (in-package :dat/html)
19 
20 (deftype array-length ()
21  "Type of an array index."
22  '(integer 0 #.array-dimension-limit))
23 
24 (deftype chunk ()
25  "Type of the input stream buffer."
26  '(vector character *))
27 
28 (defparameter *default-encoding* :utf-8)
29 
30 (defclass html-input-stream ()
31  ((source :initarg :source)
32  (encoding :reader html5-stream-encoding)
33  (char-stream :initform nil)
34  (chunk)
35  (chunk-offset)
36  (pending-cr)
37  (errors :initform nil :accessor html5-stream-errors)))
38 
39 (defun make-html-input-stream (source &key override-encoding fallback-encoding)
40  (when (stringp source)
41  ;; Encoding is not relevant when input is a string,
42  ;; but we set it utf-8 here to avoid auto detecting taking place.
43  (setf override-encoding :utf-8))
44  (let ((self (make-instance 'html-input-stream :source source)))
45  (with-slots (encoding stream) self
46  (setf encoding (detect-encoding self
47  (find-encoding override-encoding)
48  (find-encoding fallback-encoding)))
49  (open-char-stream self))
50  self))
51 
52 ;; 12.2.2.2 Character encodings
53 (defun find-encoding (encoding-name)
54  ;; Normalize the string designator
55  (setf encoding-name (string-upcase (substitute #\- #\_ (string-trim +space-characters+ (string encoding-name)))))
56  ;; All known encoding will already be interned in the keyword package so find-symbol is fine here
57  (setf encoding-name (find-symbol encoding-name :keyword))
58 
59  (handler-case
60  ;; Verfiy that flexi-streams knows the encoding and resolve aliases
61  (case (flex:external-format-name (flex:make-external-format encoding-name))
62  ;; Some encoding should be replaced by some other.
63  ;; Only those supported by flexi-streams are listed here.
64  ;; iso-8859-11 should be replaced by windows-874, but flexi-streams doesn't that encoding.
65  (:iso-8859-1 :windows-1252)
66  (:iso-8859-9 :windows-1254)
67  (:us-ascii :windows-1252)
68  (otherwise encoding-name))
69  (flex:external-format-error ())))
70 
71 ;; 12.2.2.1 Determining the character encoding
72 (defun detect-encoding (stream override-encoding fallback-encoding)
73  (with-slots (encoding) stream
74  (block nil
75  ;; 1. and 2. encoding overridden by user or transport layer
76  (when override-encoding
77  (return (cons override-encoding :certain)))
78 
79  ;; 3. wait for 1024 bytes, not implemented
80 
81  ;; 4. Detect BOM
82  (let ((bom-encoding (detect-bom stream)))
83  (when bom-encoding
84  (return (cons bom-encoding :certain))))
85 
86  ;; 5. Prescan not implemented
87 
88  ;; 6. Use fallback encoding
89  (when fallback-encoding
90  (return (cons encoding :tentative)))
91 
92  ;; 7. Autodect not implemented
93 
94  ;; 8. Implementation-defined default
95  (return (cons *default-encoding* :tentative)))))
96 
97 (defmacro handle-encoding-errors (stream &body body)
98  `(handler-bind ((flex:external-format-encoding-error
99  (lambda (x)
100  (declare (ignore x))
101  (push :invalid-codepoint (html5-stream-errors ,stream))
102  (use-value #\uFFFD))))
103  ,@body))
104 
105 (defun open-char-stream (self)
106  (with-slots (source encoding char-stream chunk chunk-offset pending-cr) self
107  (setf chunk (make-array (* 10 1024) :element-type 'character :fill-pointer 0))
108  (setf chunk-offset 0)
109  (setf pending-cr nil)
110  (when char-stream
111  (close char-stream))
112  (setf char-stream
113  (if (stringp source)
114  (make-string-input-stream source)
115  (flex:make-flexi-stream
116  (etypecase source
117  (pathname
118  (open source :element-type '(unsigned-byte 8)))
119  (stream
120  source)
121  (vector
122  (flex:make-in-memory-input-stream source)))
123  :external-format (flex:make-external-format (car encoding) :eol-style :lf))))
124  ;; 12.2.2.4 says we should always skip the first byte order mark
125  (handle-encoding-errors self
126  (let ((first-char (peek-char nil char-stream nil)))
127  (when (eql first-char #\ufeff)
128  (read-char char-stream))))))
129 
130 (defun detect-bom (self)
131  (with-slots (source) self
132  (let (byte-0 byte-1 byte-2)
133  (etypecase source
134  (vector
135  (when (> (length source) 0) (setf byte-0 (aref source 0)))
136  (when (> (length source) 1) (setf byte-1 (aref source 1)))
137  (when (> (length source) 2) (setf byte-2 (aref source 2))))
138  (pathname
139  (with-open-file (in source :element-type '(unsigned-byte 8))
140  (setf byte-0 (read-byte in nil))
141  (setf byte-1 (read-byte in nil))
142  (setf byte-2 (read-byte in nil))))
143  (stream
144  (error "Can't detect encoding when source is a stream.")))
145  (cond ((and (eql byte-0 #xfe)
146  (eql byte-1 #xff))
147  :utf-16be)
148  ((and (eql byte-0 #xff)
149  (eql byte-1 #xfe))
150  :utf-16le)
151  ((and (eql byte-0 #xef)
152  (eql byte-1 #xbb)
153  (eql byte-2 #xbf))
154  :utf-8)))))
155 
156 ;; 12.2.2.3 Changing the encoding while parsing
157 (defun html5-stream-change-encoding (stream new-encoding)
158  (setf new-encoding (find-encoding new-encoding))
159  (with-slots (encoding char-stream) stream
160  ;; 1.
161  (when (member (car encoding) '(:utf-16le :utf-16be))
162  (setf encoding (cons (car encoding) :certain))
163  (return-from html5-stream-change-encoding))
164 
165  ;; 2.
166  (when (member new-encoding '(:utf-16le :utf-16be))
167  (setf new-encoding :utf-8))
168 
169  ;; 3.
170  (when (eql (car encoding) new-encoding)
171  (setf encoding (cons (car encoding) :certain))
172  (return-from html5-stream-change-encoding))
173 
174  ;; 4. Not impleneted
175 
176  ;; 5. Restart paring from scratch
177  (setf encoding (cons new-encoding :certain))
178  (open-char-stream stream)
179  (throw 'please-reparse t)))
180 
181 (defun html5-stream-char (stream)
182  (with-slots (chunk chunk-offset) stream
183  (when (>= chunk-offset (length chunk))
184  (unless (read-chunk stream)
185  (return-from html5-stream-char +eof+)))
186  (prog1 (char chunk chunk-offset)
187  (incf chunk-offset))))
188 
189 (defun our-scan (chars opposite-p chunk &key start)
190  (loop for i from start below (length chunk)
191  for char = (char chunk i)
192  while (if opposite-p
193  (position char chars)
194  (not (position char chars)))
195  finally (return i)))
196 
197 (defun html5-stream-chars-until (stream characters &optional opposite-p)
198  "Returns a string of characters from the stream up to but not
199  including any character in characters or end of file.
200  "
201  (with-slots (chunk chunk-offset) stream
202  (declare (array-length chunk-offset) (chunk chunk))
203  (with-output-to-string (data)
204  (loop for end = (our-scan characters opposite-p chunk :start chunk-offset) do
205  ;; If nothing matched, and it wasn't because we ran out of chunk,
206  ;; then stop
207  (when (and (not end)
208  (/= chunk-offset (length chunk)))
209  (return))
210  ;; If not the whole chunk matched, return everything
211  ;; up to the part that didn't match
212  (when (and end
213  (/= chunk-offset (length chunk)))
214  (write-string chunk data :start chunk-offset :end end)
215  (setf chunk-offset end)
216  (return))
217  ;; If the whole remainder of the chunk matched,
218  ;; use it all and read the next chunk
219  (write-string chunk data :start chunk-offset)
220  (unless (read-chunk stream)
221  (return))))))
222 
223 (defun html5-stream-unget (stream char)
224  (with-slots (chunk chunk-offset) stream
225  (unless (eql char +eof+)
226  (cond ((zerop chunk-offset)
227  (cond ((< (fill-pointer chunk) (array-dimension chunk 0))
228  (incf (fill-pointer chunk))
229  (replace chunk chunk :start1 1))
230  (t
231  (let ((new-chunk (make-array (1+ (array-dimension chunk 0))
232  :element-type 'character
233  :fill-pointer (1+ (fill-pointer chunk)))))
234  (replace new-chunk chunk :start1 1)
235  (setf chunk new-chunk))))
236  (setf (char chunk 0) char))
237  (t
238  (decf chunk-offset)
239  (assert (char= char (char chunk chunk-offset))))))))
240 
241 (defun read-chunk (stream)
242  (declare (optimize speed))
243  (with-slots (char-stream chunk chunk-offset pending-cr) stream
244  (declare (array-length chunk-offset)
245  (chunk chunk))
246  (setf chunk-offset 0)
247  (let ((start 0))
248  (when pending-cr
249  (setf (char chunk 0) #\Return)
250  (setf start 1)
251  (setf pending-cr nil))
252 
253  (setf (fill-pointer chunk) (array-dimension chunk 0))
254  (handle-encoding-errors stream
255  (setf (fill-pointer chunk) (read-sequence chunk char-stream :start start)))
256 
257  (unless (zerop (length chunk))
258 
259  ;; check if last char is CR and EOF was not reached
260  (when (and (= (length chunk) (array-dimension chunk 0))
261  (eql (char chunk (1- (length chunk))) #\Return))
262  (setf pending-cr t)
263  (decf (fill-pointer chunk)))
264 
265  (report-character-errors stream chunk)
266 
267  ;; Python code replaces surrugate pairs with U+FFFD here. Why?
268 
269  ;; Normalize line endings (CR LF)
270  (loop for previous = nil then current
271  for current across chunk
272  for index of-type array-length from 0
273  with offset of-type array-length = 0
274  do (unless (and (eql previous #\Return)
275  (eql current #\Newline))
276  (unless (= index offset)
277  (setf (char chunk offset) current))
278  (when (eql current #\Return)
279  (setf (char chunk offset) #\Newline))
280  (incf offset))
281  finally (setf (fill-pointer chunk) offset))
282 
283  t))))
284 
285 (defun char-range (char1 char2)
286  (loop for i from (char-code char1) to (char-code char2)
287  collect (code-char i)))
288 
289 (defparameter *invalid-unicode*
290  `(,@(char-range #\u0001 #\u0008)
291  #\u000B
292  ,@(char-range #\u000E #\u001F)
293  ,@(char-range #\u007F #\u009F)
294  ;; The following are noncharacter as defined by Unicode.
295  ;; Clozure Common Lisp doesn't like them.
296  ,@`(
297  ,@(char-range #\uD800 #\uDFFF)
298  ,@(char-range #\uFDD0 #\uFDEF)
299  #\uFFFE
300  #\uFFFF
301  #\u0001FFFE
302  #\u0001FFFF
303  #\u0002FFFE
304  #\u0002FFFF
305  #\u0003FFFE
306  #\u0003FFFF
307  #\u0004FFFE
308  #\u0004FFFF
309  #\u0005FFFE
310  #\u0005FFFF
311  #\u0006FFFE
312  #\u0006FFFF
313  #\u0007FFFE
314  #\u0007FFFF
315  #\u0008FFFE
316  #\u0008FFFF
317  #\u0009FFFE
318  #\u0009FFFF
319  #\u000AFFFE
320  #\u000AFFFF
321  #\u000BFFFE
322  #\u000BFFFF
323  #\u000CFFFE
324  #\u000CFFFF
325  #\u000DFFFE
326  #\u000DFFFF
327  #\u000EFFFE
328  #\u000EFFFF
329  #\u000FFFFE
330  #\u000FFFFF
331  #\u0010FFFE
332  #\u0010FFFF)))
333 
334 (defparameter *invalid-unicode-hash* (make-hash-table))
335 (dolist (char *invalid-unicode*)
336  (setf (gethash char *invalid-unicode-hash*) char))
337 
338 (defun report-character-errors (stream data)
339  (loop for char across data
340  when (gethash char *invalid-unicode-hash*)
341  do (push :invalid-codepoint (html5-stream-errors stream))))
342 
343 ;;; Tokenizer
344 (defclass html-tokenizer ()
345  ((stream :initarg :stream :reader tokenizer-stream)
346  (cdata-switch-helper :initarg :cdata-switch-helper
347  :initform (constantly nil))
348  (lowercase-element-name :initform t)
349  (lowercase-attr-name :initform t)
350  (escape-flag :initform nil)
351  (last-four-chars :initform nil)
352  (state :initform :data-state :accessor tokenizer-state)
353  (escape :initform nil)
354  (current-token :initform nil)
355  (token-queue :initform nil)
356  (temporary-buffer :initform nil)))
357 
358 (defun make-html-tokenizer (source &key encoding cdata-switch-helper)
359  (make-instance 'html-tokenizer
360  :stream (make-html-input-stream source :override-encoding encoding)
361  :cdata-switch-helper cdata-switch-helper))
362 
363 (defun map-tokens (tokenizer function)
364  "Return next token or NIL on eof"
365  (with-slots (token-queue stream) tokenizer
366  (loop while (run-state tokenizer) do
367  (setf token-queue (nreverse token-queue))
368  (loop while (html5-stream-errors stream)
369  do (funcall function (list :type :parse-error :data (pop (html5-stream-errors stream)))))
370  (loop while token-queue
371  do (funcall function (pop token-queue))))))
372 
373 (defun run-state (tokenizer)
374  (run-state* tokenizer (slot-value tokenizer 'state)))
375 
376 (defgeneric run-state* (tokenizer state))
377 
378 (defmacro defstate (state (&rest slots) &body body)
379  `(defmethod run-state* (self (state (eql ,state)))
380  (with-slots (,@slots) self
381  (block nil
382  ,@body
383  t))))
384 
385 (defun push-token (self token)
386  (with-slots (token-queue) self
387  (push token token-queue)))
388 
389 (defun make-growable-string (&optional (init ""))
390  "Make an adjustable string with a fill pointer.
391 Given INIT, a string, return an adjustable version of it with the fill
392 pointer at the end."
393  (let ((string
394  (make-array (max 5 (length init))
395  :element-type 'character
396  :adjustable t
397  :fill-pointer (length init))))
398  (when init
399  (replace string init))
400  string))
401 
402 (defun nconcat (string &rest data)
403  "Destructively concatenate DATA, string designators, to STRING."
404  (declare (optimize speed))
405  (unless (array-has-fill-pointer-p string)
406  (setf string (make-growable-string string)))
407  (labels ((conc (string x)
408  (typecase x
409  (character
410  (vector-push-extend x string))
411  (string
412  (let ((len (length x)))
413  (loop for c across x do
414  (vector-push-extend c string len))))
415  (symbol (conc string (string x))))))
416  (dolist (x data string)
417  (conc string x))))
418 
419 (define-modify-macro nconcatf (&rest data) nconcat)
420 
421 (defun push-token* (self type &rest data)
422  "Push a token with :type type and :data the a string concatenation of data"
423  (push-token self (list :type type
424  :data (apply #'nconcat (make-growable-string) data))))
425 
426 (defun add-attribute (token name)
427  (setf (getf token :data) (append (getf token :data)
428  (list (cons (make-growable-string (string name))
429  (make-growable-string))))))
430 
431 (defun add-to-attr-name (token &rest data)
432  (setf (caar (last (getf token :data)))
433  (apply #'nconcat
434  (caar (last (getf token :data)))
435  data)))
436 
437 (defun add-to-attr-value (token &rest data)
438  (setf (cdar (last (getf token :data)))
439  (apply #'nconcat
440  (cdar (last (getf token :data)))
441  data)))
442 
443 (defun add-to (token indicator &rest data)
444  (setf (getf token indicator)
445  (apply #'nconcat
446  (getf token indicator)
447  data)))
448 
449 (defun consume-number-entity (self is-hex)
450  "This function returns either U+FFFD or the character based on the
451  decimal or hexadecimal representation. It also discards \";\" if present.
452  If not present a token (:type :parse-error) is emitted.
453  "
454  (with-slots (stream) self
455  (let ((allowed +digits+)
456  (radix 10)
457  (char-stack)
458  (c)
459  (char-as-int)
460  (char))
461  (when is-hex
462  (setf allowed +hex-digits+)
463  (setf radix 16))
464 
465  ;; Consume all the characters that are in range while making sure we
466  ;; don't hit an EOF.
467  (setf c (html5-stream-char stream))
468  (loop while (and (find c allowed) (not (eql c +eof+))) do
469  (push c char-stack)
470  (setf c (html5-stream-char stream)))
471 
472  ;; Convert the set of characters consumed to an int.
473  (setf char-as-int (parse-integer (coerce (nreverse char-stack) 'string) :radix radix))
474 
475  ;; Certain characters get replaced with others
476  (cond ((find char-as-int +replacement-characters+)
477  (setf char (getf +replacement-characters+ char-as-int))
478  (push-token self `(:type :parse-error
479  :data :illegal-codepoint-for-numeric-entity
480  :datavars '(:char-as-int ,char-as-int))))
481  ((or (<= #xD800 char-as-int #xDFFF)
482  (> char-as-int #x10FFFF))
483  (setf char #\uFFFD)
484  (push-token self `(:type :parse-error
485  :data :illegal-codepoint-for-numeric-entity
486  :datavars '(:char-as-int ,char-as-int))))
487  (t
488  ;; Python comment: Should speed up this check somehow (e.g. move the set to a constant)
489  (when (or (<= #x0001 char-as-int #x0008)
490  (<= #x000E char-as-int #x001F)
491  (<= #x007F char-as-int #x009F)
492  (<= #xFDD0 char-as-int #xFDEF)
493  (find char-as-int
494  #(#x000B #xFFFE #xFFFF #x1FFFE
495  #x1FFFF #x2FFFE #x2FFFF #x3FFFE
496  #x3FFFF #x4FFFE #x4FFFF #x5FFFE
497  #x5FFFF #x6FFFE #x6FFFF #x7FFFE
498  #x7FFFF #x8FFFE #x8FFFF #x9FFFE
499  #x9FFFF #xAFFFE #xAFFFF #xBFFFE
500  #xBFFFF #xCFFFE #xCFFFF #xDFFFE
501  #xDFFFF #xEFFFE #xEFFFF #xFFFFE
502  #xFFFFF #x10FFFE #x10FFFF)))
503  (push-token self `(:type :parse-error
504  :data :illegal-codepoint-for-numeric-entity
505  :datavars '(:char-as-int ,char-as-int))))
506  ;; Assume char-code-limit >= 1114112
507  (setf char (code-char char-as-int))))
508 
509  ;; Discard the ; if present. Otherwise, put it back on the queue and
510  ;; invoke parseError on parser.
511  (unless (eql c #\;)
512  (push-token self `(:type :parse-error :data :numeric-entity-without-semicolon))
513  (html5-stream-unget stream c))
514 
515  (string char))))
516 
517 (defun consume-entity (self &key allowed-char from-attribute)
518  (with-slots (stream current-token) self
519  (let ((output "&")
520  (stack (list (html5-stream-char stream))))
521  (cond ((or (find (car stack) +space-characters+)
522  (find (car stack) '(+eof+ #\< #\&))
523  (and allowed-char (eql allowed-char (car stack))))
524  (html5-stream-unget stream (car stack)))
525  ((eql (car stack) #\#)
526  (push (html5-stream-char stream) stack)
527  (let ((is-hex (find (car stack) "xX")))
528  (when is-hex
529  (push (html5-stream-char stream) stack))
530  (cond ((find (car stack) (if is-hex +hex-digits+ +digits+))
531  (html5-stream-unget stream (car stack))
532  (setf output (consume-number-entity self is-hex)))
533  (t
534  (push-token self '(:type :parse-error :data :expected-numeric-entity))
535  (html5-stream-unget stream (pop stack))
536  (when is-hex
537  (html5-stream-unget stream (pop stack)))
538  (html5-stream-unget stream (pop stack))))))
539  (t
540  ;; Consume the maximum number of characters possible, with the
541  ;; consumed characters matching one of the identifiers in the first
542  ;; column of the named character references table
543  ;; (in a case-sensitive manner).
544  (let ((entity)
545  (match-at 0))
546  (loop with node = *entities-tree*
547  for char = (car stack) then (car (push (html5-stream-char stream)
548  stack))
549  for next-node = (assoc char node)
550  while next-node
551  do (when (second next-node)
552  (setf entity (second next-node))
553  (setf match-at (length stack)))
554  do (setf node (cddr next-node)))
555  (let ((next-char))
556  ;; Unconsume those characters that are not part of the match
557  ;; This unconsumes everything if there where no match
558  (loop until (= (length stack) match-at) do
559  (setf next-char (car stack))
560  (html5-stream-unget stream (pop stack)))
561  (cond ((not entity)
562  ;; If no match can be made, then no characters are consumed, and nothing is returned.
563  ;; Is this always a parse error really?
564  (push-token self '(:type :parse-error :data :expected-named-entity)))
565  ((and from-attribute
566  (not (eql #\; (car stack)))
567  (or (eql next-char #\=)
568  (find next-char +digits+)
569  (ascii-letter-p next-char)))
570  ; Is this a parse error really?
571  (push-token self '(:type :parse-error :data :bogus))
572  (setf output (concatenate 'string "&" (reverse stack))))
573  (t
574  (unless (eql #\; (car stack))
575  (push-token self '(:type :parse-error
576  :data :named-entity-without-semicolon)))
577  (setf output entity)))))))
578 
579  (cond (from-attribute
580  (add-to-attr-value current-token output))
581  (t
582  (push-token* self (if (find (char output 0) +space-characters+)
583  :space-characters
584  :characters)
585  output))))))
586 
587 (defun process-entity-in-attribute (self &key allowed-char)
588  (consume-entity self :allowed-char allowed-char :from-attribute t))
589 
590 (defun emit-current-token (self)
591  "This method is a generic handler for emitting the tags. It also sets
592  the state to :data because that's what's needed after a token has been
593  emitted.
594  "
595  (with-slots (current-token state lowercase-element-name) self
596  (let ((token current-token))
597  ;; Add token to the queue to be yielded
598  (when (find (getf token :type) +tag-token-types+)
599  (when lowercase-element-name
600  (setf (getf token :name) (ascii-upper-2-lower (getf token :name))))
601  (when (eql (getf token :type) :end-tag)
602  (when (getf token :data)
603  (push-token self '(:type :parse-error :data :attributes-in-end-tag)))
604  (when (getf token :self-closing)
605  (push-token self '(:type :parse-error :data :self-closing-flag-on-end-tag)))))
606  (push-token self token)
607  (setf state :data-state))))
608 
609 ;;;
610 ;;; Below are the various tokenizer states worked out.
611 ;;;
612 
613 (defstate :data-state (stream state)
614  (let ((data (html5-stream-char stream)))
615  (cond ((eql data #\&)
616  (setf state :entity-data-state))
617  ((eql data #\<)
618  (setf state :tag-open-state))
619  ((eql data #\u0000)
620  (push-token self '(:type :parse-error :data :invalid-codepoint))
621  (push-token* self :characters #\u0000))
622  ((eql data +eof+)
623  ;; Tokenization ends.
624  (return nil))
625  ((find data +space-characters+)
626  ;; Directly after emitting a token you switch back to the "data
627  ;; state". At that point spaceCharacters are important so they are
628  ;; emitted separately.
629  (push-token* self :space-characters
630  data
631  (html5-stream-chars-until stream +space-characters+ t))
632  ;; No need to update lastFourChars here, since the first space will
633  ;; have already been appended to lastFourChars and will have broken
634  ;; any <!-- or --> sequences
635  )
636  (t
637  (push-token* self :characters
638  data
639  (html5-stream-chars-until stream '(#\& #\< #\u0000)))))))
640 
641 (defstate :entity-data-state (state)
642  (consume-entity self)
643  (setf state :data-state))
644 
645 (defstate :rcdata-state (stream state)
646  (let ((data (html5-stream-char stream)))
647  (cond ((eql data #\&)
648  (setf state :character-reference-in-rcdata))
649  ((eql data #\<)
650  (setf state :rcdata-less-than-sign-state))
651  ((eql data +eof+)
652  ;; Tokenization ends.
653  (return nil))
654  ((eql data #\u0000)
655  (push-token self '(:type :parse-error :data :invalid-codepoint))
656  (push-token* self :characters #\uFFFD))
657  ((find data +space-characters+)
658  ;; Directly after emitting a token you switch back to the "data
659  ;; state". At that point spaceCharacters are important so they are
660  ;; emitted separately.
661  (push-token* self :space-characters
662  data
663  (html5-stream-chars-until stream +space-characters+ t))
664  ;; No need to update lastFourChars here, since the first space will
665  ;; have already been appended to lastFourChars and will have broken
666  ;; any <!-- or --> sequences
667  )
668  (t
669  (push-token* self :characters
670  data
671  (html5-stream-chars-until stream '(#\& #\<)))))))
672 
673 (defstate :character-reference-in-rcdata (state)
674  (consume-entity self)
675  (setf state :rcdata-state))
676 
677 (defstate :rawtext-state (stream state)
678  (let ((data (html5-stream-char stream)))
679  (cond ((eql data #\<)
680  (setf state :rawtext-less-than-sign-state))
681  ((eql data #\u0000)
682  (push-token self '(:type :parse-error :data :invalid-codepoint))
683  (push-token* self :characters #\uFFFD))
684  ((eql data +eof+)
685  ;; Tokenization ends.
686  (return nil))
687  (t
688  (push-token* self :characters
689  data
690  (html5-stream-chars-until stream '(#\< #\u0000)))))))
691 
692 (defstate :script-data-state (stream state)
693  (let ((data (html5-stream-char stream)))
694  (cond ((eql data #\<)
695  (setf state :script-data-less-than-sign-state))
696  ((eql data #\u0000)
697  (push-token self '(:type :parse-error :data :invalid-codepoint))
698  (push-token* self :characters #\uFFFD))
699  ((eql data +eof+)
700  ;; Tokenization ends.
701  (return nil))
702  (t
703  (push-token* self :characters
704  data
705  (html5-stream-chars-until stream '(#\< #\u0000)))))))
706 
707 (defstate :plaintext-state (stream)
708  (let ((data (html5-stream-char stream)))
709  (cond ((eql data +eof+)
710  ;; Tokenization ends.
711  (return nil))
712  ((eql data #\u0000)
713  (push-token self '(:type :parse-error :data :invalid-codepoint))
714  (push-token* self :characters #\uFFFD))
715  (t
716  (push-token* self :characters
717  data
718  (html5-stream-chars-until stream '(#\u0000)))))))
719 
720 (defstate :tag-open-state (stream state current-token)
721  (let ((data (html5-stream-char stream)))
722  (cond ((eql data #\!)
723  (setf state :markup-declaration-open-state))
724  ((eql data #\/)
725  (setf state :close-tag-open-state))
726  ((ascii-letter-p data)
727  (setf current-token (list :type :start-tag
728  :name (make-array 1 :element-type 'character
729  :initial-element data
730  :fill-pointer t
731  :adjustable t)
732  :data '()
733  :self-closing nil
734  :self-closing-acknowledged nil))
735  (setf state :tag-name-state))
736  ((eql data #\>)
737  ;; XXX In theory it could be something besides a tag name. But
738  ;; do we really care?
739  (push-token self '(:type :parse-error :data :expected-tag-name-but-got-right-bracket))
740  (push-token* self :characters "<>")
741  (setf state :data-state))
742  ((eql data #\?)
743  ;; XXX In theory it could be something besides a tag name. But
744  ;; do we really care?
745  (push-token self '(:type :parse-error :data :expected-tag-name-but-got-question-mark))
746  (html5-stream-unget stream data)
747  (setf state :bogus-comment-state))
748  (t
749  ;; XXX
750  (push-token self '(:type :parse-error :data :expected-tag-name))
751  (push-token* self :characters "<")
752  (html5-stream-unget stream data)
753  (setf state :data-state)))))
754 
755 (defstate :close-tag-open-state
756  (stream state current-token)
757  (let ((data (html5-stream-char stream)))
758  (cond ((ascii-letter-p data)
759  (setf current-token (list :type :end-tag
760  :name (make-array 1 :element-type 'character
761  :initial-element data
762  :fill-pointer t
763  :adjustable t)
764  :data '()
765  :self-closing nil))
766  (setf state :tag-name-state))
767  ((eql data #\>)
768  (push-token self '(:type :parse-error :data :expected-closing-tag-but-got-right-bracket))
769  (setf state :data-state))
770  ((eql data +eof+)
771  (push-token self '(:type :parse-error :data :expected-closing-tag-but-got-eof))
772  (push-token* self :characters "</")
773  (setf state :data-state))
774  (t
775  ;; XXX data can be _'_...
776  (push-token self `(:type :parse-error :data :expected-closing-tag-but-got-char
777  :datavars (:data ,data)))
778  (html5-stream-unget stream data)
779  (setf state :bogus-comment-state))))
780  t)
781 
782 (defstate :tag-name-state (stream state current-token)
783  (let ((data (html5-stream-char stream)))
784  (cond ((find data +space-characters+)
785  (setf state :before-attribute-name-state))
786  ((eql data #\>)
787  (emit-current-token self))
788  ((eql data +eof+)
789  (push-token self '(:type :parse-error :data :eof-in-tag-name))
790  (setf state :data-state))
791  ((eql data #\/)
792  (setf state :self-closing-start-tag-state))
793  ((eql data #\u0000)
794  (push-token self '(:type :parse-error :data :invalid-codepoint))
795  (vector-push-extend #\uFFFD (getf current-token :name)))
796  (t
797  (vector-push-extend data (getf current-token :name))
798  ;; (Don't use charsUntil here, because tag names are
799  ;; very short and it's faster to not do anything fancy)
800  ))))
801 
802 (defstate :rcdata-less-than-sign-state (stream state temporary-buffer)
803  (let ((data (html5-stream-char stream)))
804  (cond ((eql data #\/)
805  (setf temporary-buffer (make-growable-string))
806  (setf state :rcdata-end-tag-open-state))
807  (t
808  (push-token* self :characters "<")
809  (html5-stream-unget stream data)
810  (setf state :rcdata-state)))))
811 
812 (defstate :rcdata-end-tag-open-state (stream state temporary-buffer)
813  (let ((data (html5-stream-char stream)))
814  (cond ((ascii-letter-p data)
815  (nconcatf temporary-buffer (string data))
816  (setf state :rcdata-end-tag-name-state))
817  (t
818  (push-token* self :characters "</")
819  (html5-stream-unget stream data)
820  (setf state :rcdata-state)))))
821 
822 (defstate :rcdata-end-tag-name-state (stream state temporary-buffer current-token)
823  (let ((appropriate (and current-token
824  (string-equal (getf current-token :name)
825  temporary-buffer)))
826  (data (html5-stream-char stream)))
827  (cond ((and (find data +space-characters+)
828  appropriate)
829  (setf current-token (list :type :end-tag
830  :name temporary-buffer
831  :data '()
832  :self-closing nil))
833  (setf state :before-attribute-name-state))
834  ((and (eql data #\/)
835  appropriate)
836  (setf current-token (list :type :end-tag
837  :name temporary-buffer
838  :data '()
839  :self-closing nil))
840  (setf state :self-closing-start-tag-state))
841  ((and (eql data #\>)
842  appropriate)
843  (setf current-token (list :type :end-tag
844  :name temporary-buffer
845  :data '()
846  :self-closing nil))
847  (emit-current-token self)
848  (setf state :data-state))
849  ((ascii-letter-p data)
850  (nconcatf temporary-buffer data))
851  (t
852  (push-token* self :characters "</" temporary-buffer)
853  (html5-stream-unget stream data)
854  (setf state :rcdata-state)))))
855 
856 (defstate :rawtext-less-than-sign-state (stream state temporary-buffer)
857  (let ((data (html5-stream-char stream)))
858  (cond ((eql data #\/)
859  (setf temporary-buffer (make-growable-string))
860  (setf state :rawtext-end-tag-open-state))
861  (t
862  (push-token* self :characters "<")
863  (html5-stream-unget stream data)
864  (setf state :rawtext-state)))))
865 
866 (defstate :rawtext-end-tag-open-state (stream state temporary-buffer)
867  (let ((data (html5-stream-char stream)))
868  (cond ((ascii-letter-p data)
869  (nconcatf temporary-buffer (string data))
870  (setf state :rawtext-end-tag-name-state))
871  (t
872  (push-token* self :characters "</")
873  (html5-stream-unget stream data)
874  (setf state :rawtext-state)))))
875 
876 (defstate :rawtext-end-tag-name-state (stream state temporary-buffer current-token)
877  (let ((appropriate (and current-token
878  (string-equal (getf current-token :name)
879  temporary-buffer)))
880  (data (html5-stream-char stream)))
881  (cond ((and (find data +space-characters+)
882  appropriate)
883  (setf current-token (list :type :end-tag
884  :name temporary-buffer
885  :data '()
886  :self-closing nil))
887  (setf state :before-attribute-name-state))
888  ((and (eql data #\/)
889  appropriate)
890  (setf current-token (list :type :end-tag
891  :name temporary-buffer
892  :data '()
893  :self-closing nil))
894  (setf state :self-closing-start-tag-state))
895  ((and (eql data #\>)
896  appropriate)
897  (setf current-token (list :type :end-tag
898  :name temporary-buffer
899  :data '()
900  :self-closing nil))
901  (emit-current-token self)
902  (setf state :data-state))
903  ((ascii-letter-p data)
904  (nconcatf temporary-buffer data))
905  (t
906  (push-token* self :characters "</" temporary-buffer)
907  (html5-stream-unget stream data)
908  (setf state :rawtext-state)))))
909 
910 (defstate :script-data-less-than-sign-state (stream state temporary-buffer)
911  (let ((data (html5-stream-char stream)))
912  (cond ((eql data #\/)
913  (setf temporary-buffer (make-growable-string))
914  (setf state :script-data-end-tag-open-state))
915  ((eql data #\!)
916  (push-token* self :characters "<!")
917  (setf state :script-data-escape-start-state))
918  (t
919  (push-token* self :characters "<")
920  (html5-stream-unget stream data)
921  (setf state :script-data-state)))))
922 
923 (defstate :script-data-end-tag-open-state (stream state temporary-buffer)
924  (let ((data (html5-stream-char stream)))
925  (cond ((ascii-letter-p data)
926  (nconcatf temporary-buffer data)
927  (setf state :script-data-end-tag-name-state))
928  (t
929  (push-token* self :characters "</")
930  (html5-stream-unget stream data)
931  (setf state :script-data-state)))))
932 
933 (defstate :script-data-end-tag-name-state (stream state temporary-buffer current-token)
934  (let ((appropriate (and current-token
935  (string-equal (getf current-token :name)
936  temporary-buffer)))
937  (data (html5-stream-char stream)))
938  (cond ((and (find data +space-characters+)
939  appropriate)
940  (setf current-token (list :type :end-tag
941  :name temporary-buffer
942  :data '()
943  :self-closing nil))
944  (setf state :before-attribute-name-state))
945  ((and (eql data #\/)
946  appropriate)
947  (setf current-token (list :type :end-tag
948  :name temporary-buffer
949  :data '()
950  :self-closing nil))
951  (setf state :self-closing-start-tag-state))
952  ((and (eql data #\>)
953  appropriate)
954  (setf current-token (list :type :end-tag
955  :name temporary-buffer
956  :data '()
957  :self-closing nil))
958  (emit-current-token self)
959  (setf state :data-state))
960  ((ascii-letter-p data)
961  (nconcatf temporary-buffer data))
962  (t
963  (push-token* self :characters "</" temporary-buffer)
964  (html5-stream-unget stream data)
965  (setf state :script-data-state)))))
966 
967 (defstate :script-data-escape-start-state (stream state)
968  (let ((data (html5-stream-char stream)))
969  (cond ((eql data #\-)
970  (push-token* self :characters "-")
971  (setf state :script-data-escape-start-dash-state))
972  (t
973  (html5-stream-unget stream data)
974  (setf state :script-data-state)))))
975 
976 (defstate :script-data-escape-start-dash-state (stream state)
977  (let ((data (html5-stream-char stream)))
978  (cond ((eql data #\-)
979  (push-token* self :characters "-")
980  (setf state :script-data-escaped-dash-dash-state))
981  (t
982  (html5-stream-unget stream data)
983  (setf state :script-data-state)))))
984 
985 (defstate :script-data-escaped-state (stream state)
986  (let ((data (html5-stream-char stream)))
987  (cond ((eql data #\-)
988  (push-token* self :characters "-")
989  (setf state :script-data-escaped-dash-state))
990  ((eql data #\<)
991  (setf state :script-data-escaped-less-than-sign-state))
992  ((eql data #\u0000)
993  (push-token self '(:type :parse-error :data :invalid-codepoint))
994  (push-token* self :characters #\uFFFD))
995  ((eql data +eof+)
996  (setf state :data-state))
997  (t
998  (push-token* self :characters data (html5-stream-chars-until stream '(#\< #\- #\u0000)))))))
999 
1000 (defstate :script-data-escaped-dash-state (stream state)
1001  (let ((data (html5-stream-char stream)))
1002  (cond ((eql data #\-)
1003  (push-token* self :characters "-")
1004  (setf state :script-data-escaped-dash-dash-state))
1005  ((eql data #\<)
1006  (setf state :script-data-escaped-less-than-sign-state))
1007  ((eql data #\u0000)
1008  (push-token self '(:type :parse-error :data :invalid-codepoint))
1009  (push-token* self :characters #\uFFFD)
1010  (setf state :script-data-escaped-state))
1011  ((eql data +eof+)
1012  (setf state :data-state))
1013  (t
1014  (push-token* self :characters data (html5-stream-chars-until stream '(#\< #\- #\u0000)))
1015  (setf state :script-data-escaped-state)))))
1016 
1017 (defstate :script-data-escaped-dash-dash-state (stream state)
1018  (let ((data (html5-stream-char stream)))
1019  (cond ((eql data #\-)
1020  (push-token* self :characters "-"))
1021  ((eql data #\<)
1022  (setf state :script-data-escaped-less-than-sign-state))
1023  ((eql data #\>)
1024  (push-token* self :characters ">")
1025  (setf state :script-data-state))
1026  ((eql data #\u0000)
1027  (push-token self '(:type :parse-error :data :invalid-codepoint))
1028  (push-token* self :characters #\uFFFD)
1029  (setf state :script-data-escaped-state))
1030  ((eql data +eof+)
1031  (setf state :data-state))
1032  (t
1033  (push-token* self :characters data (html5-stream-chars-until stream '(#\< #\- #\u0000)))
1034  (setf state :script-data-escaped-state)))))
1035 
1036 (defstate :script-data-escaped-less-than-sign-state (stream state temporary-buffer)
1037  (let ((data (html5-stream-char stream)))
1038  (cond ((eql data #\/)
1039  (setf temporary-buffer (make-growable-string))
1040  (setf state :script-data-escaped-end-tag-open-state))
1041  ((ascii-letter-p data)
1042  (push-token* self :characters "<" data)
1043  (setf temporary-buffer (ascii-upper-2-lower (string data)))
1044  (setf state :script-data-double-escape-start-state))
1045  (t
1046  (push-token* self :characters "<")
1047  (html5-stream-unget stream data)
1048  (setf state :script-data-escaped-state)))))
1049 
1050 (defstate :script-data-escaped-end-tag-open-state (stream state temporary-buffer)
1051  (let ((data (html5-stream-char stream)))
1052  (cond ((ascii-letter-p data)
1053  (setf temporary-buffer (string data))
1054  (setf state :script-data-escaped-end-tag-name-state))
1055  (t
1056  (push-token* self :characters "</")
1057  (html5-stream-unget stream data)
1058  (setf state :script-data-escaped-state)))))
1059 
1060 (defstate :script-data-escaped-end-tag-name-state (stream state temporary-buffer current-token)
1061  (let ((appropriate (and current-token
1062  (string-equal (getf current-token :name)
1063  temporary-buffer)))
1064  (data (html5-stream-char stream)))
1065  (cond ((and (find data +space-characters+)
1066  appropriate)
1067  (setf current-token (list :type :end-tag
1068  :name temporary-buffer
1069  :data '()
1070  :self-closing nil))
1071  (setf state :before-attribute-name-state))
1072  ((and (eql data #\/)
1073  appropriate)
1074  (setf current-token (list :type :end-tag
1075  :name temporary-buffer
1076  :data '()
1077  :self-closing nil))
1078  (setf state :self-closing-start-tag-state))
1079  ((and (eql data #\>)
1080  appropriate)
1081  (setf current-token (list :type :end-tag
1082  :name temporary-buffer
1083  :data '()
1084  :self-closing nil))
1085  (emit-current-token self)
1086  (setf state :data-state))
1087  ((ascii-letter-p data)
1088  (nconcatf temporary-buffer data))
1089  (t
1090  (push-token* self :characters "</" temporary-buffer)
1091  (html5-stream-unget stream data)
1092  (setf state :script-data-escaped-state)))))
1093 
1094 (defstate :script-data-double-escape-start-state (stream state temporary-buffer)
1095  (let ((data (html5-stream-char stream)))
1096  (cond ((or (find data +space-characters+)
1097  (find data '(#\/ #\>)))
1098  (push-token* self :characters data)
1099  (if (string= (string-downcase temporary-buffer) "script")
1100  (setf state :script-data-double-escaped-state)
1101  (setf state :script-data-escaped-state)))
1102  ((ascii-letter-p data)
1103  (push-token* self :characters data)
1104  (nconcatf temporary-buffer (string data)))
1105  (t
1106  (html5-stream-unget stream data)
1107  (setf state :script-data-escaped-state)))))
1108 
1109 (defstate :script-data-double-escaped-state (stream state)
1110  (let ((data (html5-stream-char stream)))
1111  (cond ((eql data #\-)
1112  (push-token* self :characters "-")
1113  (setf state :script-data-double-escaped-dash-state))
1114  ((eql data #\<)
1115  (push-token* self :characters "<")
1116  (setf state :script-data-double-escaped-less-than-sign-state))
1117  ((eql data #\u0000)
1118  (push-token self '(:type :parse-error :data :invalid-codepoint))
1119  (push-token* self :characters #\uFFFD))
1120  ((eql data +eof+)
1121  (push-token self '(:type :parse-error :data :eof-in-script-in-script))
1122  (setf state :data-state))
1123  (t
1124  (push-token* self :characters data)))))
1125 
1126 (defstate :script-data-double-escaped-dash-state (stream state)
1127  (let ((data (html5-stream-char stream)))
1128  (cond ((eql data #\-)
1129  (push-token* self :characters "-")
1130  (setf state :script-data-double-escaped-dash-dash-state))
1131  ((eql data #\<)
1132  (push-token* self :characters "<")
1133  (setf state :script-data-double-escaped-less-than-sign-state))
1134  ((eql data #\u0000)
1135  (push-token self '(:type :parse-error :data :invalid-codepoint))
1136  (push-token* self :characters #\uFFFD)
1137  (setf state :script-data-double-escaped-state))
1138  ((eql data +eof+)
1139  (push-token self '(:type :parse-error :data :eof-in-script-in-script))
1140  (setf state :data-state))
1141  (t
1142  (push-token* self :characters data)
1143  (setf state :script-data-double-escaped-state)))))
1144 
1145 ;; FIXME: Incorrectly named in Python code: scriptDataDoubleEscapedDashState (same the one above)
1146 (defstate :script-data-double-escaped-dash-dash-state (stream state)
1147  (let ((data (html5-stream-char stream)))
1148  (cond ((eql data #\-)
1149  (push-token* self :characters "-")
1150  (setf state :script-data-double-escaped-dash-dash-state))
1151  ((eql data #\<)
1152  (push-token* self :characters "<")
1153  (setf state :script-data-double-escaped-less-than-sign-state))
1154  ((eql data #\>)
1155  (push-token* self :characters ">")
1156  (setf state :script-data-state))
1157  ((eql data #\u0000)
1158  (push-token self '(:type :parse-error :data :invalid-codepoint))
1159  (push-token* self :characters #\uFFFD)
1160  (setf state :script-data-double-escaped-state))
1161  ((eql data +eof+)
1162  (push-token self '(:type :parse-error :data :eof-in-script-in-script))
1163  (setf state :data-state))
1164  (t
1165  (push-token* self :characters data)
1166  (setf state :script-data-double-escaped-state)))))
1167 
1168 (defstate :script-data-double-escaped-less-than-sign-state (stream state temporary-buffer)
1169  (let ((data (html5-stream-char stream)))
1170  (cond ((eql data #\/)
1171  (push-token* self :characters "/")
1172  (setf temporary-buffer (make-growable-string))
1173  (setf state :script-data-double-escape-end-state))
1174  (t
1175  (html5-stream-unget stream data)
1176  (setf state :script-data-double-escaped-state)))))
1177 
1178 (defstate :script-data-double-escape-end-state (stream state temporary-buffer)
1179  (let ((data (html5-stream-char stream)))
1180  (cond ((or (find data +space-characters+)
1181  (find data '(#\/ #\>)))
1182  (push-token* self :characters data)
1183  (if (string= (string-downcase temporary-buffer) "script")
1184  (setf state :script-data-escaped-state)
1185  (setf state :script-data-double-escaped-state)))
1186  ((ascii-letter-p data)
1187  (push-token* self :characters data)
1188  (nconcatf temporary-buffer data))
1189  (t
1190  (html5-stream-unget stream data)
1191  (setf state :script-data-double-escaped-state)))))
1192 
1193 (defstate :before-attribute-name-state (stream state current-token)
1194  (let ((data (html5-stream-char stream)))
1195  (cond ((find data +space-characters+)
1196  (html5-stream-chars-until stream +space-characters+ t))
1197  ((ascii-letter-p data)
1198  (add-attribute current-token data)
1199  (setf state :attribute-name-state))
1200  ((eql data #\>)
1201  (emit-current-token self))
1202  ((eql data #\/)
1203  (setf state :self-closing-start-tag-state))
1204  ((find data '(#\' #\" #\= #\<))
1205  (push-token self '(:type :parse-error :data :invalid-character-in-attribute-name))
1206  (add-attribute current-token data)
1207  (setf state :attribute-name-state))
1208  ((eql data #\u0000)
1209  (push-token self '(:type :parse-error :data :invalid-codepoint))
1210  (add-attribute current-token #\uFFFD)
1211  (setf state :attribute-name-state))
1212  ((eql data +eof+)
1213  (push-token self '(:type :parse-error :data :expected-attribute-name-but-got-eof))
1214  (setf state :data-state))
1215  (t
1216  (add-attribute current-token data)
1217  (setf state :attribute-name-state)))))
1218 
1219 (defstate :attribute-name-state (stream state current-token lowercase-attr-name)
1220  (let ((data (html5-stream-char stream))
1221  (leaving-this-state t)
1222  (emit-token nil))
1223  (cond ((eql data #\=)
1224  (setf state :before-attribute-value-state))
1225  ((ascii-letter-p data)
1226  (add-to-attr-name current-token data
1227  (html5-stream-chars-until stream +ascii-letters+ t))
1228  (setf leaving-this-state nil))
1229  ((eql data #\>)
1230  ;; XXX If we emit here the attributes are converted to a dict
1231  ;; without being checked and when the code below runs we error
1232  ;; because data is a dict not a list
1233  (setf emit-token t))
1234  ((find data +space-characters+)
1235  (setf state :after-attribute-name-state))
1236  ((eql data #\/)
1237  (setf state :self-closing-start-tag-state))
1238  ((eql data #\u0000)
1239  (push-token self '(:type :parse-error :data :invalid-codepoint))
1240  (add-to-attr-name current-token #\uFFFD)
1241  (setf leaving-this-state nil))
1242  ((find data '(#\' #\" #\<))
1243  (push-token self '(:type :parse-error :data :invalid-character-in-attribute-name))
1244  (add-to-attr-name current-token data)
1245  (setf leaving-this-state nil))
1246  ((eql data +eof+)
1247  (push-token self '(:type :parse-error :data :eof-in-attribute-name))
1248  (setf state :data-state))
1249  (t
1250  (add-to-attr-name current-token data)
1251  (setf leaving-this-state nil)))
1252  (when leaving-this-state
1253  ;; Attributes are not dropped at this stage. That happens when the
1254  ;; start tag token is emitted so values can still be safely appended
1255  ;; to attributes, but we do want to report the parse error in time.
1256  (when lowercase-attr-name
1257  (setf (caar (last (getf current-token :data)))
1258  (ascii-upper-2-lower (caar (last (getf current-token :data))))))
1259  (loop for (name . value) in (butlast (getf current-token :data)) do
1260  (when (string= (caar (last (getf current-token :data))) name)
1261  (push-token self '(:type :parse-error :data :duplicate-attribute))
1262  (return)))
1263  ;; XXX Fix for above XXX
1264  (when emit-token
1265  (emit-current-token self)))))
1266 
1267 (defstate :after-attribute-name-state (stream state current-token)
1268  (let ((data (html5-stream-char stream)))
1269  (cond ((find data +space-characters+)
1270  (html5-stream-chars-until stream +space-characters+ t))
1271  ((eql data #\=)
1272  (setf state :before-attribute-value-state))
1273  ((eql data #\>)
1274  (emit-current-token self))
1275  ((ascii-letter-p data)
1276  (add-attribute current-token data)
1277  (setf state :attribute-name-state))
1278  ((eql data #\/)
1279  (setf state :self-closing-start-tag-state))
1280  ((eql data #\u0000)
1281  (push-token self '(:type :parse-error :data :invalid-codepoint))
1282  (add-attribute current-token #\uFFFD)
1283  (setf state :attribute-name-state))
1284  ((find data '(#\' #\" #\<))
1285  (push-token self '(:type :parse-error :data :invalid-character-after-attribute-name))
1286  (add-attribute current-token data)
1287  (setf state :attribute-name-state))
1288  ((eql data +eof+)
1289  (push-token self '(:type :parse-error :data :expected-end-of-tag-but-got-eof))
1290  (setf state :data-state))
1291  (t
1292  (add-attribute current-token data)
1293  (setf state :attribute-name-state)))))
1294 
1295 (defstate :before-attribute-value-state (stream state current-token)
1296  (let ((data (html5-stream-char stream)))
1297  (cond ((find data +space-characters+)
1298  (html5-stream-chars-until stream +space-characters+ t))
1299  ((eql data #\")
1300  (setf state :attribute-value-double-quoted-state))
1301  ((eql data #\&)
1302  (setf state :attribute-value-un-quoted-state)
1303  (html5-stream-unget stream data))
1304  ((eql data #\')
1305  (setf state :attribute-value-single-quoted-state))
1306  ((eql data #\>)
1307  (push-token self '(:type :parse-error :data :expected-attribute-value-but-got-right-bracket))
1308  (emit-current-token self))
1309  ((eql data #\u0000)
1310  (push-token self '(:type :parse-error :data :invalid-codepoint))
1311  (add-to-attr-value current-token #\uFFFD)
1312  (setf state :attribute-value-un-quoted-state))
1313  ((find data '(#\= #\< #\`))
1314  (push-token self '(:type :parse-error :data :equals-in-unquoted-attribute-value))
1315  (add-to-attr-value current-token data)
1316  (setf state :attribute-value-un-quoted-state))
1317  ((eql data +eof+)
1318  (push-token self '(:type :parse-error :data :expected-attribute-value-but-got-eof))
1319  (setf state :data-state))
1320  (t
1321  (add-to-attr-value current-token data)
1322  (setf state :attribute-value-un-quoted-state)))))
1323 
1324 (defstate :attribute-value-double-quoted-state (stream state current-token)
1325  (let ((data (html5-stream-char stream)))
1326  (cond ((eql data #\")
1327  (setf state :after-attribute-value-state))
1328  ((eql data #\&)
1329  (process-entity-in-attribute self :allowed-char #\"))
1330  ((eql data #\u0000)
1331  (push-token self '(:type :parse-error :data :invalid-codepoint))
1332  (add-to-attr-value current-token #\uFFFD))
1333  ((eql data +eof+)
1334  (push-token self '(:type :parse-error :data :eof-in-attribute-value-double-quote))
1335  (setf state :data-state))
1336  (t
1337  (add-to-attr-value current-token
1338  data
1339  (html5-stream-chars-until stream '(#\" #\&)))))))
1340 
1341 (defstate :attribute-value-single-quoted-state (stream state current-token)
1342  (let ((data (html5-stream-char stream)))
1343  (cond ((eql data #\')
1344  (setf state :after-attribute-value-state))
1345  ((eql data #\&)
1346  (process-entity-in-attribute self :allowed-char #\'))
1347  ((eql data #\u0000)
1348  (push-token self '(:type :parse-error :data :invalid-codepoint))
1349  (add-to-attr-value current-token #\uFFFD))
1350  ((eql data +eof+)
1351  (push-token self '(:type :parse-error :data :eof-in-attribute-value-single-quote))
1352  (setf state :data-state))
1353  (t
1354  (add-to-attr-value current-token
1355  data
1356  (html5-stream-chars-until stream '(#\' #\&)))))))
1357 
1358 (defstate :attribute-value-un-quoted-state (stream state current-token)
1359  (let ((data (html5-stream-char stream)))
1360  (cond ((find data +space-characters+)
1361  (setf state :before-attribute-name-state))
1362  ((eql data #\&)
1363  (process-entity-in-attribute self :allowed-char #\>))
1364  ((eql data #\>)
1365  (emit-current-token self))
1366  ((find data '(#\" #\' #\= #\< #\`))
1367  (push-token self '(:type :parse-error :data :unexpected-character-in-unquoted-attribute-value))
1368  (add-to-attr-value current-token data))
1369  ((eql data #\u0000)
1370  (push-token self '(:type :parse-error :data :invalid-codepoint))
1371  (add-to-attr-value current-token #\uFFFD))
1372  ((eql data +eof+)
1373  (push-token self '(:type :parse-error :data :eof-in-attribute-value-no-quotes))
1374  (setf state :data-state))
1375  (t
1376  (add-to-attr-value current-token
1377  data
1378  (html5-stream-chars-until stream `(#\& #\> #\" #\' #\= #\< #\`
1379  ,@+space-characters+)))))))
1380 
1381 (defstate :after-attribute-value-state (stream state current-token)
1382  (let ((data (html5-stream-char stream)))
1383  (cond ((find data +space-characters+)
1384  (setf state :before-attribute-name-state))
1385  ((eql data #\>)
1386  (emit-current-token self))
1387  ((eql data #\/)
1388  (setf state :self-closing-start-tag-state))
1389  ((eql data +eof+)
1390  (push-token self '(:type :parse-error :data :unexpected-EOF-after-attribute-value))
1391  (html5-stream-unget stream data)
1392  (setf state :data-state))
1393  (t
1394  (push-token self '(:type :parse-error :data :unexpected-character-after-attribute-value))
1395  (html5-stream-unget stream data)
1396  (setf state :before-attribute-name-state)))))
1397 
1398 (defstate :self-closing-start-tag-state (stream state current-token)
1399  (let ((data (html5-stream-char stream)))
1400  (cond ((eql data #\>)
1401  (setf (getf current-token :self-closing) t)
1402  (emit-current-token self))
1403  ((eql data +eof+)
1404  (push-token self '(:type :parse-error :data :unexpected-EOF-after-solidus-in-tag))
1405  (html5-stream-unget stream data)
1406  (setf state :data-state))
1407  (t
1408  (push-token self '(:type :parse-error :data :unexpected-character-after-soldius-in-tag))
1409  (html5-stream-unget stream data)
1410  (setf state :before-attribute-name-state)))))
1411 
1412 (defstate :bogus-comment-state (stream state current-token)
1413  ;; Make a new comment token and give it as value all the characters
1414  ;; until the first > or EOF (charsUntil checks for EOF automatically)
1415  ;; and emit it.
1416  (let ((data (html5-stream-chars-until stream '(#\>))))
1417  (setf data (substitute #\uFFFD #\u0000 data))
1418  (push-token* self :comment data)
1419  ;; Eat the character directly after the bogus comment which is either a
1420  ;; ">" or an EOF.
1421  (html5-stream-char stream)
1422  (setf state :data-state)))
1423 
1424 (defstate :markup-declaration-open-state (stream state current-token
1425  cdata-switch-helper)
1426  (let ((char-stack (make-array 1
1427  :initial-element (html5-stream-char stream)
1428  :fill-pointer 1
1429  :adjustable t)))
1430  (cond ((eql (aref char-stack (1- (length char-stack))) #\-)
1431  (vector-push-extend (html5-stream-char stream) char-stack)
1432  (when (eql (aref char-stack (1- (length char-stack))) #\-)
1433  (setf current-token (list :type :comment :data ""))
1434  (setf state :comment-start-state)
1435  (return t)))
1436  ((find (aref char-stack (1- (length char-stack))) '(#\d #\D))
1437  (let ((matched t))
1438  (loop for expected in '((#\o #\O) (#\c #\C) (#\t #\T) (#\y #\Y) (#\p #\P) (#\e #\E)) do
1439  (vector-push-extend (html5-stream-char stream) char-stack)
1440  (unless (find (aref char-stack (1- (length char-stack))) expected)
1441  (setf matched nil)
1442  (return)))
1443  (when matched
1444  (setf current-token (list :type :doctype
1445  :name ""
1446  :public-id nil
1447  :system-id nil
1448  :correct t))
1449  (setf state :doctype-state)
1450  (return t))))
1451  ((and (eql (aref char-stack (1- (length char-stack))) #\[)
1452  (funcall cdata-switch-helper))
1453  (let ((matched t))
1454  (loop for expected across "CDATA[" do
1455  (vector-push-extend (html5-stream-char stream) char-stack)
1456  (unless (eql (aref char-stack (1- (length char-stack))) expected)
1457  (setf matched nil)
1458  (return)))
1459  (when matched
1460  (setf state :cdata-section-state)
1461  (return t)))))
1462  (push-token self '(:type :parse-error :data :expected-dashes-or-doctype))
1463  (loop while (plusp (length char-stack)) do
1464  (html5-stream-unget stream (vector-pop char-stack)))
1465  (setf state :bogus-comment-state)))
1466 
1467 (defstate :comment-start-state (stream state current-token)
1468  (let ((data (html5-stream-char stream)))
1469  (cond ((eql data #\-)
1470  (setf state :comment-start-dash-state))
1471  ((eql data #\u0000)
1472  (push-token self '(:type :parse-error :data :invalid-codepoint))
1473  (add-to current-token :data #\uFFFD))
1474  ((eql data #\>)
1475  (push-token self '(:type :parse-error :data :incorrect-comment))
1476  (push-token self current-token)
1477  (setf state :data-state))
1478  ((eql data +eof+)
1479  (push-token self '(:type :parse-error :data :eof-in-comment))
1480  (push-token self current-token)
1481  (setf state :data-state))
1482  (t
1483  (add-to current-token :data data)
1484  (setf state :comment-state)))))
1485 
1486 (defstate :comment-start-dash-state (stream state current-token)
1487  (let ((data (html5-stream-char stream)))
1488  (cond ((eql data #\-)
1489  (setf state :comment-end-state))
1490  ((eql data #\u0000)
1491  (push-token self '(:type :parse-error :data :invalid-codepoint))
1492  (add-to current-token :data "-" #\uFFFD))
1493  ((eql data #\>)
1494  (push-token self '(:type :parse-error :data :incorrect-comment))
1495  (push-token self current-token)
1496  (setf state :data-state))
1497  ((eql data +eof+)
1498  (push-token self '(:type :parse-error :data :eof-in-comment))
1499  (push-token self current-token)
1500  (setf state :data-state))
1501  (t
1502  (add-to current-token :data "-" data)
1503  (setf state :comment-state)))))
1504 
1505 (defstate :comment-state (stream state current-token)
1506  (let ((data (html5-stream-char stream)))
1507  (cond ((eql data #\-)
1508  (setf state :comment-end-dash-state))
1509  ((eql data #\u0000)
1510  (push-token self '(:type :parse-error :data :invalid-codepoint))
1511  (add-to current-token :data #\uFFFD))
1512  ((eql data +eof+)
1513  (push-token self '(:type :parse-error :data :eof-in-comment))
1514  (push-token self current-token)
1515  (setf state :data-state))
1516  (t
1517  (add-to current-token :data data
1518  (html5-stream-chars-until stream '(#\- #\u0000)))))))
1519 
1520 (defstate :comment-end-dash-state (stream state current-token)
1521  (let ((data (html5-stream-char stream)))
1522  (cond ((eql data #\-)
1523  (setf state :comment-end-state))
1524  ((eql data #\u0000)
1525  (push-token self '(:type :parse-error :data :invalid-codepoint))
1526  (add-to current-token :data "-" #\uFFFD))
1527  ((eql data +eof+)
1528  (push-token self '(:type :parse-error :data :eof-in-comment-end-dash))
1529  (push-token self current-token)
1530  (setf state :data-state))
1531  (t
1532  (add-to current-token :data "-" data)
1533  (setf state :comment-state)))))
1534 
1535 (defstate :comment-end-state (stream state current-token)
1536  (let ((data (html5-stream-char stream)))
1537  (cond ((eql data #\>)
1538  (push-token self current-token)
1539  (setf state :data-state))
1540  ((eql data #\u0000)
1541  (push-token self '(:type :parse-error :data :invalid-codepoint))
1542  (add-to current-token :data "--" #\uFFFD)
1543  (setf state :comment-state))
1544  ((eql data #\!)
1545  (push-token self '(:type :parse-error :data :unexpected-bang-after-double-dash-in-comment))
1546  (setf state :comment-end-bang-state))
1547  ((eql data #\-)
1548  (push-token self '(:type :parse-error :data :unexpected-dash-after-double-dash-in-comment))
1549  (add-to current-token :data data))
1550  ((eql data +eof+)
1551  (push-token self '(:type :parse-error :data :eof-in-comment-double-dash))
1552  (push-token self current-token)
1553  (setf state :data-state))
1554  (t
1555  ;; XXX
1556  (push-token self '(:type :parse-error :data :unexpected-char-in-comment))
1557  (add-to current-token :data "--" data)
1558  (setf state :comment-state)))))
1559 
1560 (defstate :comment-end-bang-state (stream state current-token)
1561  (let ((data (html5-stream-char stream)))
1562  (cond ((eql data #\>)
1563  (push-token self current-token)
1564  (setf state :data-state))
1565  ((eql data #\-)
1566  (add-to current-token :data "--!")
1567  (setf state :comment-end-dash-state))
1568  ((eql data #\u0000)
1569  (push-token self '(:type :parse-error :data :invalid-codepoint))
1570  (add-to current-token :data "--!" #\uFFFD)
1571  (setf state :comment-state))
1572  ((eql data +eof+)
1573  (push-token self '(:type :parse-error :data :eof-in-comment-end-bang-state))
1574  (push-token self current-token)
1575  (setf state :data-state))
1576  (t
1577  (add-to current-token :data "--!" data)
1578  (setf state :comment-state)))))
1579 
1580 (defstate :doctype-state (stream state current-token)
1581  (let ((data (html5-stream-char stream)))
1582  (cond ((find data +space-characters+)
1583  (setf state :before-doctype-name-state))
1584  ((eql data +eof+)
1585  (push-token self '(:type :parse-error :data :expected-doctype-name-but-got-eof))
1586  (setf (getf current-token :correct) nil)
1587  (push-token self current-token)
1588  (setf state :data-state))
1589  (t
1590  (push-token self '(:type :parse-error :data :need-space-after-doctype))
1591  (html5-stream-unget stream data)
1592  (setf state :before-doctype-name-state)))))
1593 
1594 (defstate :before-doctype-name-state (stream state current-token)
1595  (let ((data (html5-stream-char stream)))
1596  (cond ((find data +space-characters+)
1597  ;; pass
1598  )
1599  ((eql data #\>)
1600  (push-token self '(:type :parse-error :data :expected-doctype-name-but-got-right-bracket))
1601  (setf (getf current-token :correct) nil)
1602  (push-token self current-token)
1603  (setf state :data-state))
1604  ((eql data #\u0000)
1605  (push-token self '(:type :parse-error :data :invalid-codepoint))
1606  (add-to current-token :name #\uFFFD)
1607  (setf state :doctype-name-state))
1608  ((eql data +eof+)
1609  (push-token self '(:type :parse-error :data :expected-doctype-name-but-got-eof))
1610  (setf (getf current-token :correct) nil)
1611  (push-token self current-token)
1612  (setf state :data-state))
1613  (t
1614  (setf (getf current-token :name) (string data))
1615  (setf state :doctype-name-state)))))
1616 
1617 (defstate :doctype-name-state (stream state current-token)
1618  (let ((data (html5-stream-char stream)))
1619  (cond ((find data +space-characters+)
1620  (setf (getf current-token :name) (ascii-upper-2-lower (getf current-token :name)))
1621  (setf state :after-doctype-name-state))
1622  ((eql data #\>)
1623  (setf (getf current-token :name) (ascii-upper-2-lower (getf current-token :name)))
1624  (push-token self current-token)
1625  (setf state :data-state))
1626  ((eql data #\u0000)
1627  (push-token self '(:type :parse-error :data :invalid-codepoint))
1628  (add-to current-token :name #\uFFFD)
1629  (setf state :doctype-name-state))
1630  ((eql data +eof+)
1631  (push-token self '(:type :parse-error :data :eof-in-doctype-name))
1632  (setf (getf current-token :correct) nil)
1633  (setf (getf current-token :name) (ascii-upper-2-lower (getf current-token :name)))
1634  (push-token self current-token)
1635  (setf state :data-state))
1636  (t
1637  (add-to current-token :name data)))))
1638 
1639 (defstate :after-doctype-name-state (stream state current-token)
1640  (let ((data (html5-stream-char stream)))
1641  (cond ((find data +space-characters+)
1642  ;; pass
1643  )
1644  ((eql data #\>)
1645  (push-token self current-token)
1646  (setf state :data-state))
1647  ((eql data +eof+)
1648  (setf (getf current-token :correct) nil)
1649  (html5-stream-unget stream data)
1650  (push-token self '(:type :parse-error :data :eof-in-doctype))
1651  (push-token self current-token)
1652  (setf state :data-state))
1653  (t
1654  (cond ((find data '(#\p #\P))
1655  (let ((matched t))
1656  (loop for expected in '((#\u #\U) (#\b #\B) (#\l #\L) (#\i #\I) (#\c #\C)) do
1657  (setf data (html5-stream-char stream))
1658  (unless (find data expected)
1659  (setf matched nil)
1660  (return)))
1661  (when matched
1662  (setf state :after-doctype-public-keyword-state)
1663  (return t))))
1664  ((find data '(#\s #\S))
1665  (let ((matched t))
1666  (loop for expected in '((#\y #\Y) (#\s #\S) (#\t #\T) (#\e #\E) (#\m #\M)) do
1667  (setf data (html5-stream-char stream))
1668  (unless (find data expected)
1669  (setf matched nil)
1670  (return)))
1671  (when matched
1672  (setf state :after-doctype-system-keyword-state)
1673  (return t)))))
1674  ;; All the characters read before the current 'data' will be
1675  ;; [a-zA-Z], so they're garbage in the bogus doctype and can be
1676  ;; discarded; only the latest character might be '>' or EOF
1677  ;; and needs to be ungetted
1678  (html5-stream-unget stream data)
1679  (push-token self `(:type :parse-error :data :expected-space-or-right-bracket-in-doctype
1680  :datavars (:data ,data)))
1681  (setf (getf current-token :correct) nil)
1682  (setf state :bogus-doctype-state)))))
1683 
1684 (defstate :after-doctype-public-keyword-state (stream state current-token)
1685  (let ((data (html5-stream-char stream)))
1686  (cond ((find data +space-characters+)
1687  (setf state :before-doctype-public-identifier-state))
1688  ((find data '(#\' #\"))
1689  (push-token self '(:type :parse-error :data :unexpected-char-in-doctype))
1690  (html5-stream-unget stream data)
1691  (setf state :before-doctype-public-identifier-state))
1692  ((eql data +eof+)
1693  (push-token self '(:type :parse-error :data :eof-in-doctype))
1694  (setf (getf current-token :correct) nil)
1695  (push-token self current-token)
1696  (setf state :data-state))
1697  (t
1698  (html5-stream-unget stream data)
1699  (setf state :before-doctype-public-identifier-state)))))
1700 
1701 (defstate :before-doctype-public-identifier-state (stream state current-token)
1702  (let ((data (html5-stream-char stream)))
1703  (cond ((find data +space-characters+)
1704  ;; pass
1705  )
1706  ((eql data #\")
1707  (setf (getf current-token :public-id) "")
1708  (setf state :doctype-public-identifier-double-quoted-state))
1709  ((eql data #\')
1710  (setf (getf current-token :public-id) "")
1711  (setf state :doctype-public-identifier-single-quoted-state))
1712  ((eql data #\>)
1713  (push-token self '(:type :parse-error :data :unexpected-end-of-doctype))
1714  (setf (getf current-token :correct) nil)
1715  (push-token self current-token)
1716  (setf state :data-state))
1717  ((eql data +eof+)
1718  (push-token self '(:type :parse-error :data :eof-in-doctype))
1719  (setf (getf current-token :correct) nil)
1720  (push-token self current-token)
1721  (setf state :data-state))
1722  (t
1723  (push-token self '(:type :parse-error :data :unexpected-char-in-doctype))
1724  (setf (getf current-token :correct) nil)
1725  (setf state :bogus-doctype-state)))))
1726 
1727 (defstate :doctype-public-identifier-double-quoted-state (stream state current-token)
1728  (let ((data (html5-stream-char stream)))
1729  (cond ((eql data #\")
1730  (setf state :after-doctype-public-identifier-state))
1731  ((eql data #\u0000)
1732  (push-token self '(:type :parse-error :data :invalid-codepoint))
1733  (add-to current-token :public-id #\uFFFD))
1734  ((eql data #\>)
1735  (push-token self '(:type :parse-error :data :unexpected-end-of-doctype))
1736  (setf (getf current-token :correct) nil)
1737  (push-token self current-token)
1738  (setf state :data-state))
1739  ((eql data +eof+)
1740  (push-token self '(:type :parse-error :data :eof-in-doctype))
1741  (setf (getf current-token :correct) nil)
1742  (push-token self current-token)
1743  (setf state :data-state))
1744  (t
1745  (add-to current-token :public-id data)))))
1746 
1747 (defstate :doctype-public-identifier-single-quoted-state (stream state current-token)
1748  (let ((data (html5-stream-char stream)))
1749  (cond ((eql data #\')
1750  (setf state :after-doctype-public-identifier-state))
1751  ((eql data #\u0000)
1752  (push-token self '(:type :parse-error :data :invalid-codepoint))
1753  (add-to current-token :public-id #\uFFFD))
1754  ((eql data #\>)
1755  (push-token self '(:type :parse-error :data :unexpected-end-of-doctype))
1756  (setf (getf current-token :correct) nil)
1757  (push-token self current-token)
1758  (setf state :data-state))
1759  ((eql data +eof+)
1760  (push-token self '(:type :parse-error :data :eof-in-doctype))
1761  (setf (getf current-token :correct) nil)
1762  (push-token self current-token)
1763  (setf state :data-state))
1764  (t
1765  (add-to current-token :public-id data)))))
1766 
1767 (defstate :after-doctype-public-identifier-state (stream state current-token)
1768  (let ((data (html5-stream-char stream)))
1769  (cond ((find data +space-characters+)
1770  (setf state :between-doctype-public-and-system-identifiers-state))
1771  ((eql data #\>)
1772  (push-token self current-token)
1773  (setf state :data-state))
1774  ((eql data #\")
1775  (push-token self '(:type :parse-error :data :unexpected-char-in-doctype))
1776  (setf (getf current-token :system-id) "")
1777  (setf state :doctype-system-identifier-double-quoted-state))
1778  ((eql data #\')
1779  (push-token self '(:type :parse-error :data :unexpected-char-in-doctype))
1780  (setf (getf current-token :system-id) "")
1781  (setf state :doctype-system-identifier-single-quoted-state))
1782  ((eql data +eof+)
1783  (push-token self '(:type :parse-error :data :eof-in-doctype))
1784  (setf (getf current-token :correct) nil)
1785  (push-token self current-token)
1786  (setf state :data-state))
1787  (t
1788  (push-token self '(:type :parse-error :data :unexpected-char-in-doctype))
1789  (setf (getf current-token :correct) nil)
1790  (setf state :bogus-doctype-state)))))
1791 
1792 (defstate :between-doctype-public-and-system-identifiers-state (stream state current-token)
1793  (let ((data (html5-stream-char stream)))
1794  (cond ((find data +space-characters+)
1795  ;; pass
1796  )
1797  ((eql data #\>)
1798  (push-token self current-token)
1799  (setf state :data-state))
1800  ((eql data #\")
1801  (setf (getf current-token :system-id) "")
1802  (setf state :doctype-system-identifier-double-quoted-state))
1803  ((eql data #\')
1804  (setf (getf current-token :system-id) "")
1805  (setf state :doctype-system-identifier-single-quoted-state))
1806  ((eql data +eof+)
1807  (push-token self '(:type :parse-error :data :eof-in-doctype))
1808  (setf (getf current-token :correct) nil)
1809  (push-token self current-token)
1810  (setf state :data-state))
1811  (t
1812  (push-token self '(:type :parse-error :data :unexpected-char-in-doctype))
1813  (setf (getf current-token :correct) nil)
1814  (setf state :bogus-doctype-state)))))
1815 
1816 (defstate :after-doctype-system-keyword-state (stream state current-token)
1817  (let ((data (html5-stream-char stream)))
1818  (cond ((find data +space-characters+)
1819  (setf state :before-doctype-system-identifier-state))
1820  ((find data '(#\' #\"))
1821  (push-token self '(:type :parse-error :data :unexpected-char-in-doctype))
1822  (html5-stream-unget stream data)
1823  (setf state :before-doctype-system-identifier-state))
1824  ((eql data +eof+)
1825  (push-token self '(:type :parse-error :data :eof-in-doctype))
1826  (setf (getf current-token :correct) nil)
1827  (push-token self current-token)
1828  (setf state :data-state))
1829  (t
1830  (html5-stream-unget stream data)
1831  (setf state :before-doctype-system-identifier-state)))))
1832 
1833 (defstate :before-doctype-system-identifier-state (stream state current-token)
1834  (let ((data (html5-stream-char stream)))
1835  (cond ((find data +space-characters+)
1836  ;; pass
1837  )
1838  ((eql data #\")
1839  (setf (getf current-token :system-id) "")
1840  (setf state :doctype-system-identifier-double-quoted-state))
1841  ((eql data #\')
1842  (setf (getf current-token :system-id) "")
1843  (setf state :doctype-system-identifier-single-quoted-state))
1844  ((eql data #\>)
1845  (push-token self '(:type :parse-error :data :unexpected-end-of-doctype))
1846  (setf (getf current-token :correct) nil)
1847  (push-token self current-token)
1848  (setf state :data-state))
1849  ((eql data +eof+)
1850  (push-token self '(:type :parse-error :data :eof-in-doctype))
1851  (setf (getf current-token :correct) nil)
1852  (push-token self current-token)
1853  (setf state :data-state))
1854  (t
1855  (push-token self '(:type :parse-error :data :unexpected-char-in-doctype))
1856  (setf (getf current-token :correct) nil)
1857  (setf state :bogus-doctype-state)))))
1858 
1859 (defstate :doctype-system-identifier-double-quoted-state (stream state current-token)
1860  (let ((data (html5-stream-char stream)))
1861  (cond ((eql data #\")
1862  (setf state :after-doctype-system-identifier-state))
1863  ((eql data #\u0000)
1864  (push-token self '(:type :parse-error :data :invalid-codepoint))
1865  (add-to current-token :system-id #\uFFFD))
1866  ((eql data #\>)
1867  (push-token self '(:type :parse-error :data :unexpected-end-of-doctype))
1868  (setf (getf current-token :correct) nil)
1869  (push-token self current-token)
1870  (setf state :data-state))
1871  ((eql data +eof+)
1872  (push-token self '(:type :parse-error :data :eof-in-doctype))
1873  (setf (getf current-token :correct) nil)
1874  (push-token self current-token)
1875  (setf state :data-state))
1876  (t
1877  (add-to current-token :system-id data)))))
1878 
1879 (defstate :doctype-system-identifier-single-quoted-state (stream state current-token)
1880  (let ((data (html5-stream-char stream)))
1881  (cond ((eql data #\')
1882  (setf state :after-doctype-system-identifier-state))
1883  ((eql data #\u0000)
1884  (push-token self '(:type :parse-error :data :invalid-codepoint))
1885  (add-to current-token :system-id #\uFFFD))
1886  ((eql data #\>)
1887  (push-token self '(:type :parse-error :data :unexpected-end-of-doctype))
1888  (setf (getf current-token :correct) nil)
1889  (push-token self current-token)
1890  (setf state :data-state))
1891  ((eql data +eof+)
1892  (push-token self '(:type :parse-error :data :eof-in-doctype))
1893  (setf (getf current-token :correct) nil)
1894  (push-token self current-token)
1895  (setf state :data-state))
1896  (t
1897  (add-to current-token :system-id data)))))
1898 
1899 (defstate :after-doctype-system-identifier-state (stream state current-token)
1900  (let ((data (html5-stream-char stream)))
1901  (cond ((find data +space-characters+)
1902  ;; pass
1903  )
1904  ((eql data #\>)
1905  (push-token self current-token)
1906  (setf state :data-state))
1907  ((eql data +eof+)
1908  (push-token self '(:type :parse-error :data :eof-in-doctype))
1909  (setf (getf current-token :correct) nil)
1910  (push-token self current-token)
1911  (setf state :data-state))
1912  (t
1913  (push-token self '(:type :parse-error :data :unexpected-char-in-doctype))
1914  (setf state :bogus-doctype-state)))))
1915 
1916 (defstate :bogus-doctype-state (stream state current-token)
1917  (let ((data (html5-stream-char stream)))
1918  (cond ((eql data #\>)
1919  (push-token self current-token)
1920  (setf state :data-state))
1921  ((eql data +eof+)
1922  ;; XXX EMIT
1923  (html5-stream-unget stream data)
1924  (push-token self current-token)
1925  (setf state :data-state))
1926  (t
1927  ;; pass
1928  ))))
1929 
1930 (defstate :cdata-section-state (stream state current-token)
1931  (let ((data '()))
1932  (loop
1933  (push (html5-stream-chars-until stream '(#\])) data)
1934  (let ((char-stack '())
1935  (matched t))
1936  (loop for expected across "]]>" do
1937  (push (html5-stream-char stream) char-stack)
1938  (cond ((eql (car char-stack) +eof+)
1939  (pop char-stack)
1940  (setf data (append char-stack data))
1941  (return))
1942  ((not (eql (car char-stack) expected))
1943  (setf matched nil)
1944  (setf data (append char-stack data))
1945  (return))))
1946  (when matched
1947  (return))))
1948  (setf data (apply #'concatenate 'string (mapcar #'string (nreverse data))))
1949  ;; Deal with null here rather than in the parser
1950  (let ((null-count (count #\u0000 data)))
1951  (when (plusp null-count)
1952  (push-token self '(:type :parse-error :data :invalid-codepoint))
1953  (setf data (nsubstitute #\uFFFD #\u0000 data))))
1954  (when (plusp (length data))
1955  (push-token* self :characters data))
1956  (setf state :data-state)))
1957 
1958 ;;; simple-tree
1959 ;; A basic implementation of a DOM-core like thing
1960 
1961 (defclass node ()
1962  ((type :initform :node :allocation :class :reader node-type)
1963  (name :initarg :name :initform nil :reader node-name)
1964  (namespace :initarg :namespace :initform nil :reader node-namespace)
1965  (parent :initform nil :reader node-parent)
1966  (value :initform nil :initarg :value
1967  :accessor node-value)
1968  (child-nodes :initform nil :accessor %node-child-nodes)
1969  (last-child :initform nil :accessor last-child)))
1970 
1971 (defmethod (setf %node-child-nodes) :after (value (node node))
1972  (setf (last-child node) (last value)))
1973 
1974 (defclass document (node)
1975  ((type :initform :document :allocation :class)))
1976 
1977 (defclass document-fragment (document)
1978  ((type :initform :document-fragment :allocation :class)))
1979 
1980 (defclass document-type (node)
1981  ((type :initform :document-type :allocation :class)
1982  (public-id :initarg :public-id :reader node-public-id)
1983  (system-id :initarg :system-id :reader node-system-id)))
1984 
1985 (defclass text-node (node)
1986  ((type :initform :text :allocation :class)))
1987 
1988 (defclass element (node)
1989  ((type :initform :element :allocation :class)
1990  (attributes :initform nil :accessor %node-attributes)))
1991 
1992 (defclass comment-node (node)
1993  ((type :initform :comment :allocation :class)))
1994 
1995 ;;;
1996 ;;; Creating nodes
1997 ;;;
1998 
1999 (defun make-document ()
2000  (make-instance 'document))
2001 
2002 (defun make-fragment (document)
2003  (declare (ignore document))
2004  (make-instance 'document-fragment))
2005 
2006 (defun make-doctype (document name public-id system-id)
2007  (declare (ignore document))
2008  (make-instance 'document-type :name name :public-id public-id :system-id system-id))
2009 
2010 (defun make-comment (document data)
2011  (declare (ignore document))
2012  (make-instance 'comment-node :value data))
2013 
2014 (defun make-element (document name namespace)
2015  (declare (ignore document))
2016  (make-instance 'element :name name :namespace namespace))
2017 
2018 (defun make-text-node (document data)
2019  (declare (ignore document))
2020  (make-instance 'text-node :value data))
2021 
2022 ;;;
2023 ;;; Node methods
2024 ;;;
2025 
2026 (defun node-first-child (node)
2027  (car (%node-child-nodes node)))
2028 
2029 (defun node-last-child (node)
2030  (car (last-child node)))
2031 
2032 (defun node-previous-sibling (node)
2033  (loop for (this next) on (%node-child-nodes (node-parent node))
2034  when (eql next node) do (return this)))
2035 
2036 (defun node-next-sibling (node)
2037  (loop for (this next) on (%node-child-nodes (node-parent node))
2038  when (eql this node) do (return next)))
2039 
2040 (defun node-remove-child (node child)
2041  (setf (%node-child-nodes node)
2042  (remove child (%node-child-nodes node)))
2043  (setf (slot-value child 'parent) nil))
2044 
2045 (defun node-append-child (node child)
2046  (when (node-parent child)
2047  (node-remove-child (node-parent child) child))
2048  (setf (slot-value child 'parent) node)
2049  (if (%node-child-nodes node)
2050  (setf (last-child node)
2051  (push child (cdr (last-child node))))
2052  (setf (%node-child-nodes node)
2053  (list child)))
2054  (%node-child-nodes node))
2055 
2056 (defun node-insert-before (node child insert-before)
2057  (let ((child-nodes (%node-child-nodes node)))
2058  (setf (slot-value child 'parent) node)
2059  (labels ((insert-before (child-nodes)
2060  (cond ((endp child-nodes)
2061  (cons child nil))
2062  ((eql (car child-nodes) insert-before)
2063  (cons child child-nodes))
2064  (t (rplacd child-nodes (insert-before (cdr child-nodes)))))))
2065  (setf (%node-child-nodes node)
2066  (insert-before child-nodes)))))
2067 
2068 (defun element-attribute (node attribute &optional namespace)
2069  (cdr (assoc (cons attribute namespace)
2070  (%node-attributes node)
2071  :test #'equal)))
2072 
2073 (defun (setf element-attribute) (new-value node attribute
2074  &optional namespace)
2075  (check-type attribute string)
2076  (check-type new-value string)
2077  (let ((old-attr (assoc (cons attribute namespace)
2078  (%node-attributes node)
2079  :test #'equal)))
2080  (if old-attr
2081  (setf (cdr old-attr) new-value)
2082  (push (cons (cons attribute namespace) new-value) (%node-attributes node)))))
2083 
2084 ;;;
2085 ;;; Traversing
2086 ;;;
2087 
2088 (defun element-map-children (function node)
2089  (map nil function (%node-child-nodes node)))
2090 
2091 (defun element-map-attributes* (function node)
2092  (loop for ((name . namespace) . value) in (%node-attributes node)
2093  do (funcall function name namespace value)))
2094 
2095 (defun element-map-attributes (function node)
2096  (element-map-attributes*
2097  (lambda (name namespace value)
2098  (funcall function
2099  (if namespace
2100  (format nil "~A:~A" (find-prefix namespace) name)
2101  name)
2102  namespace
2103  value))
2104  node))
2105 
2106 ;;
2107 ;; Printing for the ease of debugging
2108 ;;
2109 
2110 (defun node-count (tree)
2111  (typecase tree
2112  (element (1+ (apply #'+ (mapcar #'node-count (%node-child-nodes tree)))))
2113  ((or document document-fragment)
2114  (apply #'+ (mapcar #'node-count (%node-child-nodes tree))))
2115  (t 1)))
2116 
2117 (defmethod print-object ((node document) stream)
2118  (print-unreadable-object (node stream :type t :identity t)
2119  (format stream "nodes: ~A" (node-count node))))
2120 
2121 (defmethod print-object ((node node) stream)
2122  (print-unreadable-object (node stream :type t :identity t)
2123  (format stream "~A" (node-name node))))
2124 
2125 (defmethod print-object ((node text-node) stream)
2126  (print-unreadable-object (node stream :type t :identity t)
2127  (write (node-value node) :stream stream :length 30)))
2128 
2129 ;;; html5-parser-class
2130 (defvar *parser*)
2131 
2132 (defclass html-parser ()
2133  ((html-namespace :initform (find-namespace "html"))
2134  (strict :initarg :strict)
2135  (inner-html-mode)
2136  (container :initform "div")
2137  (tokenizer)
2138  (document :initform (make-document))
2139  (errors :initform '())
2140  (phase :accessor parser-phase)
2141  first-start-tag
2142  compat-mode
2143  inner-html
2144  last-phase
2145  original-phase
2146  before-rcdata-phase
2147  (character-tokens :initform nil)
2148  frameset-ok
2149  open-elements
2150  active-formatting-elements
2151  head-pointer
2152  form-pointer
2153  insert-from-table
2154  (in-body-process-space-characters-mode :initform :non-pre)))
2155 
2156 ;;; tree-help
2157 (defmacro pop-end (place)
2158  "Pop from the end of list"
2159  (let ((old-list (gensym)))
2160  `(let ((,old-list ,place))
2161  (prog1 (car (last ,old-list))
2162  (setf ,place (butlast ,old-list))))))
2163 
2164 (defmacro push-end (object place)
2165  "Push to the end of list"
2166  `(progn
2167  ;(format t "~&push ~S to ~S" ',object ',place)
2168  (setf ,place (append ,place (list ,object)))))
2169 
2170 
2171 (defvar *parser*)
2172 
2173 (defun document* ()
2174  (slot-value *parser* 'document))
2175 
2176 (defun node-clone* (node)
2177  (ecase (node-type node)
2178  (:document
2179  (make-document))
2180  (:document-fragment
2181  (make-fragment (document*)))
2182  (:document-type
2183  (make-doctype (document*)
2184  (node-name node)
2185  (node-public-id node)
2186  (node-system-id node)))
2187  (:comment
2188  (make-comment (document*) (node-value node)))
2189  (:text
2190  (make-text-node (document*) (node-value node)))
2191  (:element
2192  (let ((clone (make-element (document*) (node-name node) (node-namespace node))))
2193  (element-map-attributes*
2194  (lambda (name namespace value)
2195  (setf (element-attribute clone name namespace) value))
2196  node)
2197  clone))))
2198 
2199 (defun node-name-tuple (node)
2200  (cons (or (node-namespace node)
2201  (find-namespace "html"))
2202  (node-name node)))
2203 
2204 (defun node-name-tuple-values (node)
2205  (values (or (node-namespace node)
2206  (find-namespace "html"))
2207  (node-name node)))
2208 
2209 (defun node-has-content (node)
2210  (not (null (node-first-child node))))
2211 
2212 (defun node-attributes= (node1 node2)
2213  (labels ((has-all-attributes-of (node1 node2)
2214  (element-map-attributes*
2215  (lambda (name namespace value)
2216  (unless (equal value
2217  (element-attribute node2 name namespace))
2218  (return-from has-all-attributes-of nil)))
2219  node1)
2220  t))
2221  (and (has-all-attributes-of node1 node2)
2222  (has-all-attributes-of node2 node1))))
2223 
2224 (defun node-append-child* (node child)
2225  (let ((last-child (node-last-child node)))
2226  (if (and (eql :text (node-type child))
2227  last-child
2228  (eql :text (node-type last-child)))
2229  (nconcatf (node-value last-child)
2230  (node-value child))
2231  (node-append-child node child))))
2232 
2233 (defun node-insert-before* (node child insert-before)
2234  (when (eql :text (node-type child))
2235  (let ((prev-child (node-previous-sibling insert-before)))
2236  (when (and prev-child
2237  (eql :text (node-type prev-child)))
2238  (node-remove-child node prev-child)
2239  (setf child (make-text-node
2240  (document*)
2241  (concatenate 'string
2242  (node-value prev-child)
2243  (node-value child)))))))
2244  (node-insert-before node child insert-before))
2245 
2246 (defun node-reparent-children (node new-parent)
2247  (element-map-children (lambda (child)
2248  (node-append-child new-parent child))
2249  node))
2250 
2251 (defun node-insert-text (node data &optional insert-before)
2252  (if insert-before
2253  (node-insert-before* node (make-text-node (document*) data) insert-before)
2254  (node-append-child* node (make-text-node (document*) data))))
2255 
2256 (defun last-open-element ()
2257  (with-slots (open-elements) *parser*
2258  (car (last open-elements))))
2259 
2260 (defun create-element (token)
2261  "Create an element but don't insert it anywhere"
2262  (with-slots (html-namespace) *parser*
2263  (let ((element (make-element (document*)
2264  (getf token :name)
2265  (or (getf token :namespace)
2266  html-namespace))))
2267  (loop for (name . value) in (getf token :data)
2268  do (if (consp name)
2269  (setf (element-attribute element (second name) (third name)) value)
2270  (setf (element-attribute element name) value)))
2271  element)))
2272 
2273 
2274 (defun insert-root (token)
2275  (with-slots (open-elements) *parser*
2276  (let ((element (create-element token)))
2277  (assert element)
2278  (push-end element open-elements)
2279  (node-append-child (document*) element))))
2280 
2281 (defun insert-doctype (token)
2282  (node-append-child (document*)
2283  (make-doctype (document*)
2284  (getf token :name)
2285  (getf token :public-id)
2286  (getf token :system-id))))
2287 
2288 (defun insert-comment (token &optional parent)
2289  (with-slots (open-elements) *parser*
2290  (unless parent
2291  (setf parent (car (last open-elements))))
2292  (node-append-child parent (make-comment (document*) (getf token :data)))))
2293 
2294 (defun insert-element-normal (token)
2295  (with-slots (open-elements) *parser*
2296  (let ((element (create-element token)))
2297  (node-append-child (last-open-element) element)
2298  (push-end element open-elements)
2299  element)))
2300 
2301 (defun insert-element-table (token)
2302  (with-slots (open-elements) *parser*
2303  (if (not (member (node-name (last-open-element))
2304  +table-insert-mode-elements+ :test #'string=))
2305  (insert-element-normal token)
2306  (let ((element (create-element token)))
2307  ;; We should be in the InTable mode. This means we want to do
2308  ;; special magic element rearranging
2309  (multiple-value-bind (parent insert-before)
2310  (get-table-misnested-nodeposition)
2311  (if (not insert-before)
2312  (node-append-child* parent element)
2313  (node-insert-before* parent element insert-before))
2314  (push-end element open-elements))
2315  element))))
2316 
2317 (defun insert-element (token)
2318  (with-slots (insert-from-table) *parser*
2319  (if insert-from-table
2320  (insert-element-table token)
2321  (insert-element-normal token))))
2322 
2323 (defun parser-insert-text (data &optional parent)
2324  "Insert text data."
2325  (with-slots (open-elements insert-from-table) *parser*
2326  (unless parent
2327  (setf parent (car (last open-elements))))
2328  (cond ((or (not insert-from-table)
2329  (and insert-from-table
2330  (not (member (node-name (last-open-element))
2331  +table-insert-mode-elements+ :test #'string=))))
2332  (node-insert-text parent data))
2333  (t
2334  ;; We should be in the InTable mode. This means we want to do
2335  ;; special magic element rearranging
2336  (multiple-value-bind (parent insert-before)
2337  (get-table-misnested-nodeposition)
2338  (node-insert-text parent data insert-before))))))
2339 
2340 (defun get-table-misnested-nodeposition ()
2341  "Get the foster parent element, and sibling to insert before
2342  (or None) when inserting a misnested table node"
2343  (with-slots (open-elements) *parser*
2344  ;; The foster parent element is the one which comes before the most
2345  ;; recently opened table element
2346  (let ((last-table (find "table" open-elements :key #'node-name :test #'string= :from-end t))
2347  (foster-parent nil)
2348  (insert-before nil))
2349 
2350  (cond (last-table
2351  ;; XXX - we should really check that this parent is actually a
2352  ;; node here
2353  (if (node-parent last-table)
2354  (setf foster-parent (node-parent last-table)
2355  insert-before last-table)
2356  (setf foster-parent (elt open-elements (1- (position last-table open-elements))))))
2357  (t
2358  (setf foster-parent (first open-elements))))
2359  (values foster-parent insert-before))))
2360 
2361 (defun generate-implied-end-tags (&optional exclude)
2362  (with-slots (open-elements) *parser*
2363  (let ((name (node-name (last-open-element))))
2364  ;; XXX td, th and tr are not actually needed
2365  (when (and (member name '("dd" "dt" "li" "option" "optgroup" "p" "rp" "rt") :test #'string=)
2366  (not (equal name exclude)))
2367  (pop-end open-elements)
2368  ;; XXX This is not entirely what the specification says. We should
2369  ;; investigate it more closely.
2370  (generate-implied-end-tags exclude)))))
2371 
2372 (defun reconstruct-active-formatting-elements ()
2373  ;; Within this algorithm the order of steps described in the
2374  ;; specification is not quite the same as the order of steps in the
2375  ;; code. It should still do the same though.
2376  (with-slots (active-formatting-elements open-elements) *parser*
2377 
2378  ;; Step 1: stop the algorithm when there's nothing to do.
2379  (unless active-formatting-elements
2380  (return-from reconstruct-active-formatting-elements))
2381 
2382  ;; Step 2 and step 3: we start with the last element. So i is -1.
2383  (let* ((i (1- (length active-formatting-elements)))
2384  (entry (elt active-formatting-elements i)))
2385  (when (or (eql entry :marker)
2386  (member entry open-elements))
2387  (return-from reconstruct-active-formatting-elements))
2388 
2389  ;; Step 6
2390  (loop while (and (not (eql entry :marker))
2391  (not (member entry open-elements))) do
2392  (when (zerop i)
2393  ;; This will be reset to 0 below
2394  (setf i -1)
2395  (return))
2396  (decf i)
2397  ;; Step 5: let entry be one earlier in the list.
2398  (setf entry (elt active-formatting-elements i)))
2399 
2400  (loop
2401  ;; Step 7
2402  (incf i)
2403 
2404  ;; Step 8
2405  (setf entry (elt active-formatting-elements i))
2406 
2407  ;; Step 9
2408  (let* ((element (insert-element (list :type :start-tag
2409  :name (node-name entry)
2410  :namespace (node-namespace entry)))))
2411  (element-map-attributes* (lambda (name namespace value)
2412  (setf (element-attribute element name namespace) value))
2413  entry)
2414 
2415  ;; Step 10
2416  (setf (elt active-formatting-elements i) element)
2417 
2418  ;; Step 11
2419  (when (eql element (car (last active-formatting-elements)))
2420  (return)))))))
2421 
2422 (defun clear-active-formatting-elements ()
2423  (with-slots (active-formatting-elements) *parser*
2424  (loop for entry = (pop-end active-formatting-elements)
2425  while (and active-formatting-elements
2426  (not (eql entry :marker))))))
2427 
2428 (defun element-in-active-formatting-elements (name)
2429  "Check if an element exists between the end of the active
2430  formatting elements and the last marker. If it does, return it, else
2431  return false"
2432  (with-slots (active-formatting-elements) *parser*
2433  (loop for item in (reverse active-formatting-elements) do
2434  ;; Check for Marker first because if it's a Marker it doesn't have a
2435  ;; name attribute.
2436  (when (eql item :marker)
2437  (return nil))
2438  (when (string= (node-name item) name)
2439  (return item)))))
2440 
2441 (defun scope-tree ()
2442  (load-time-value
2443  (flet ((unflatten (alist)
2444  "Turn an alist into a tree."
2445  (let ((alist2
2446  (mapcar #'list
2447  (remove-duplicates (mapcar #'car alist)
2448  :test #'equal))))
2449  (loop for (key . value) in alist
2450  do (push value (cdr (assoc key alist2
2451  :test #'equal))))
2452  ;; Put the XHTML ns first.
2453  (sort alist2 #'<
2454  :key (lambda (pair)
2455  (position (car pair)
2456  '("http://www.w3.org/1999/xhtml"
2457  "http://www.w3.org/2000/svg"
2458  "http://www.w3.org/1998/Math/MathML")
2459  :test #'string=))))))
2460  (let ((html (find-namespace "html")))
2461  `((nil . ,(unflatten +scoping-elements+))
2462  ("button" . ,(unflatten
2463  `(,@+scoping-elements+
2464  (,html . "button"))))
2465  ("list" . ,(unflatten
2466  `(,@+scoping-elements+
2467  (,html . "ol")
2468  (,html . "ul"))))
2469  ("table" . ((,html "html" "table")))
2470  ("select" . ((,html "optgroup" "option"))))))))
2471 
2472 (defun element-in-scope (target &optional variant)
2473  (let ((list-elements
2474  (cdr (assoc variant (scope-tree) :test #'equal)))
2475  (invert (equal "select" variant)))
2476  (dolist (node (reverse (slot-value *parser* 'open-elements)))
2477  (when (or (and (stringp target)
2478  (string= (node-name node) target))
2479  (eql node target))
2480  (return-from element-in-scope t))
2481 
2482  (multiple-value-bind (ns name)
2483  (node-name-tuple-values node)
2484  (let ((found (member name (cdr (assoc ns list-elements :test #'string=))
2485  :test #'string=)))
2486  (when invert
2487  (setf found (not found)))
2488  (when found
2489  (return-from element-in-scope nil)))))
2490 
2491  (error "We should never reach this point")))
2492 
2493 ;;; Parser
2494 ;; external interface
2495 (defun parse-html5 (source &key encoding strictp container dom)
2496  (parse-html5-from-source source
2497  :encoding encoding
2498  :strictp strictp
2499  :container container
2500  :dom dom))
2501 
2502 (defun parse-html5-fragment (source &key encoding strictp (container "div") dom)
2503  (parse-html5-from-source source
2504  :encoding encoding
2505  :strictp strictp
2506  :container container
2507  :dom dom))
2508 
2509 (defgeneric transform-html5-dom (to-type node &key)
2510  (:method ((to-type cons) node &key)
2511  (apply #'transform-html5-dom (car to-type) node (cdr to-type)))
2512  (:method (to-type node &key &allow-other-keys)
2513  (error "No TRANSFORM-HTML5-DOM method defined for dom type ~S." to-type)))
2514 
2515 
2516 ;; internal
2517 
2518 (defun parse-html5-from-source (source &key container encoding strictp dom)
2519  (let ((*parser* (make-instance 'html-parser
2520  :strict strictp)))
2521  (parser-parse source
2522  :fragment-p container
2523  :encoding encoding)
2524  (with-slots (open-elements errors) *parser*
2525  (let ((document
2526  (if container
2527  (let ((fragment (make-fragment (document*))))
2528  (node-reparent-children (first open-elements) fragment)
2529  fragment)
2530  (document*))))
2531  (values (if dom
2532  (transform-html5-dom dom document)
2533  document)
2534  (reverse errors))))))
2535 
2536 (defvar *phase*)
2537 
2538 (defun ascii-ichar= (char1 char2)
2539  "ASCII case-insensitive char="
2540  (or (char= char1 char2)
2541  (and (or (char<= #\A char1 #\Z)
2542  (char<= #\A char2 #\Z))
2543  (char= (char-downcase char1)
2544  (char-downcase char2)))))
2545 
2546 (defun ascii-istring= (string1 string2)
2547  "ASCII case-insensitive string="
2548  (every #'ascii-ichar= string1 string2))
2549 
2550 (defun cdata-switch-helper ()
2551  (and (last-open-element)
2552  (not (equal (node-namespace (last-open-element))
2553  (slot-value *parser* 'html-namespace)))))
2554 
2555 (defun parser-parse (source &key fragment-p encoding)
2556  (with-slots (inner-html-mode container tokenizer)
2557  *parser*
2558  (setf inner-html-mode fragment-p)
2559  (when (stringp fragment-p)
2560  (setf container fragment-p))
2561  (setf tokenizer (make-html-tokenizer source
2562  :encoding encoding
2563 
2564  :cdata-switch-helper #'cdata-switch-helper))
2565  (parser-reset)
2566  (loop
2567  ;; The input stream will throw please-reparse with result true
2568  ;; if the encoding is changed
2569  while (catch 'please-reparse
2570  (main-loop)
2571  nil)
2572  do (parser-reset))))
2573 
2574 (defun parser-reset ()
2575  (with-slots (open-elements active-formatting-elements
2576  head-pointer form-pointer insert-from-table
2577  first-start-tag errors compat-mode inner-html-mode
2578  inner-html container tokenizer phase last-phase
2579  before-rcdata-phase frameset-ok
2580  html-namespace)
2581  *parser*
2582  (setf open-elements '())
2583  (setf active-formatting-elements '())
2584  (setf head-pointer nil)
2585  (setf form-pointer nil)
2586  (setf insert-from-table nil)
2587  (setf first-start-tag nil)
2588  (setf errors '())
2589  (setf compat-mode :no-quirks)
2590  (cond (inner-html-mode
2591  (setf inner-html (string-downcase container))
2592  (cond ((member inner-html +cdata-elements+ :test #'string=)
2593  (setf (slot-value tokenizer 'state) :rcdata-state))
2594  ((member inner-html +rcdata-elements+ :test #'string=)
2595  (setf (slot-value tokenizer 'state) :rawtext-state))
2596  ((string= inner-html "plaintext")
2597  (setf (slot-value tokenizer 'state) :plaintext-state)))
2598  (insert-root (implied-tag-token "html" :start-tag))
2599  (setf phase :before-head)
2600  (reset-insertion-mode))
2601  (t
2602  (setf inner-html nil)
2603  (setf phase :initial)))
2604 
2605  (setf last-phase nil)
2606  (setf before-rcdata-phase nil)
2607  (setf frameset-ok t)))
2608 
2609 (defun is-html-integration-point (element)
2610  (if (and (string= (node-name element) "annotation-xml")
2611  (string= (node-namespace element) (find-namespace "mathml")))
2612  (and (element-attribute element "encoding")
2613  (member (ascii-upper-2-lower (element-attribute element "encoding"))
2614  '("text/html" "application/xhtml+xml")
2615  :test #'string=))
2616  (member (node-name-tuple element)
2617  +html-integration-point-elements+
2618  :test #'equal)))
2619 
2620 (defun is-math-ml-text-integration-point (element)
2621  (member (node-name-tuple element)
2622  +mathml-text-integration-point-elements+
2623  :test #'equal))
2624 
2625 (defun main-loop ()
2626  (with-slots (tokenizer phase)
2627  *parser*
2628  (map-tokens tokenizer (lambda (token)
2629  (process-token (normalize-token token))))
2630  (loop with reprocess = t
2631  with phases = '()
2632  while reprocess do
2633  (push phase phases)
2634  (setf reprocess (process-eof nil :phase phase))
2635  (when reprocess
2636  (assert (not (member phase phases)))))))
2637 
2638 (defun process-token (token)
2639  (with-slots (tokenizer last-open-element html-namespace)
2640  *parser*
2641  (let ((new-token token)
2642  (type))
2643  (loop while new-token do
2644  (let* ((current-node (last-open-element))
2645  (current-node-namespace (if current-node (node-namespace current-node)))
2646  (current-node-name (if current-node (node-name current-node))))
2647 
2648  (setf type (getf new-token :type))
2649 
2650  (cond ((eql type :parse-error)
2651  (parser-parse-error (getf token :data) (getf token :datavars))
2652  (setf new-token nil))
2653  (t
2654  (let (phase)
2655  (if (or (null (slot-value *parser* 'open-elements))
2656  (equal current-node-namespace html-namespace)
2657  (and (is-math-ml-text-integration-point current-node)
2658  (or (and (eql type :start-tag)
2659  (not (member (getf token :name) '("mglyph" "malignmark") :test #'string=)))
2660  (eql type :characters)
2661  (eql type :space-characters)))
2662  (and (equal current-node-namespace (find-namespace "mathml"))
2663  (equal current-node-name "annotation-xml")
2664  (eql type :start-tag)
2665  (equal (getf token :name) "svg"))
2666  (and (is-html-integration-point current-node)
2667  (member type '(:start-tag :characters :space-characters))))
2668  (setf phase (slot-value *parser* 'phase))
2669  (setf phase :in-foreign-content))
2670  ;(format t "~&phase ~S token ~S~%" phase new-token)
2671  (setf new-token
2672  (ecase type
2673  (:characters
2674  (process-characters new-token :phase phase))
2675  (:space-characters
2676  (process-space-characters new-token :phase phase))
2677  (:start-tag
2678  (process-start-tag new-token :phase phase))
2679  (:end-tag
2680  (process-end-tag new-token :phase phase))
2681  (:comment
2682  (process-comment new-token :phase phase))
2683  (:doctype
2684  (process-doctype new-token :phase phase))))
2685  ;(format t " phase returned ~S new-token ~S~%" phase new-token)
2686  ))))
2687  (when (and (eql type :start-tag)
2688  (getf token :self-closing)
2689  (not (getf token :self-closing-acknowledged)))
2690  (parser-parse-error :non-void-element-with-trailing-solidus
2691  `(:name ,(getf token :name))))))))
2692 
2693 (defun parser-parse-error (error-code &optional datavars)
2694  (with-slots (errors) *parser*
2695  (push (list error-code datavars) errors)))
2696 
2697 ;; TODO rename to a longer and more descriptive name when we are done writing the code
2698 (defun perror (error-code &rest datavars)
2699  (parser-parse-error error-code datavars))
2700 
2701 (defun normalize-token (token)
2702  (when (getf token :start-tag)
2703  ;; Remove duplicate attributes
2704  (setf (getf token :data) (remove-duplicates (getf token :data)
2705  :key #'car
2706  :test #'string=
2707  :from-end t)))
2708  token)
2709 
2710 (defun adjust-attributes (token replacements)
2711  (setf (getf token :data)
2712  (loop for (name . value) in (getf token :data)
2713  collect (cons (or (cdr (assoc name replacements :test #'string=))
2714  name)
2715  value))))
2716 
2717 (defun adjust-math-ml-attributes (token)
2718  (adjust-attributes token '(("definitionurl" ."definitionURL"))))
2719 
2720 (defun adjust-svg-attributes (token)
2721  (adjust-attributes token '(("attributename" . "attributeName")
2722  ("attributetype" . "attributeType")
2723  ("basefrequency" . "baseFrequency")
2724  ("baseprofile" . "baseProfile")
2725  ("calcmode" . "calcMode")
2726  ("clippathunits" . "clipPathUnits")
2727  ("contentscripttype" . "contentScriptType")
2728  ("contentstyletype" . "contentStyleType")
2729  ("diffuseconstant" . "diffuseConstant")
2730  ("edgemode" . "edgeMode")
2731  ("externalresourcesrequired" . "externalResourcesRequired")
2732  ("filterres" . "filterRes")
2733  ("filterunits" . "filterUnits")
2734  ("glyphref" . "glyphRef")
2735  ("gradienttransform" . "gradientTransform")
2736  ("gradientunits" . "gradientUnits")
2737  ("kernelmatrix" . "kernelMatrix")
2738  ("kernelunitlength" . "kernelUnitLength")
2739  ("keypoints" . "keyPoints")
2740  ("keysplines" . "keySplines")
2741  ("keytimes" . "keyTimes")
2742  ("lengthadjust" . "lengthAdjust")
2743  ("limitingconeangle" . "limitingConeAngle")
2744  ("markerheight" . "markerHeight")
2745  ("markerunits" . "markerUnits")
2746  ("markerwidth" . "markerWidth")
2747  ("maskcontentunits" . "maskContentUnits")
2748  ("maskunits" . "maskUnits")
2749  ("numoctaves" . "numOctaves")
2750  ("pathlength" . "pathLength")
2751  ("patterncontentunits" . "patternContentUnits")
2752  ("patterntransform" . "patternTransform")
2753  ("patternunits" . "patternUnits")
2754  ("pointsatx" . "pointsAtX")
2755  ("pointsaty" . "pointsAtY")
2756  ("pointsatz" . "pointsAtZ")
2757  ("preservealpha" . "preserveAlpha")
2758  ("preserveaspectratio" . "preserveAspectRatio")
2759  ("primitiveunits" . "primitiveUnits")
2760  ("refx" . "refX")
2761  ("refy" . "refY")
2762  ("repeatcount" . "repeatCount")
2763  ("repeatdur" . "repeatDur")
2764  ("requiredextensions" . "requiredExtensions")
2765  ("requiredfeatures" . "requiredFeatures")
2766  ("specularconstant" . "specularConstant")
2767  ("specularexponent" . "specularExponent")
2768  ("spreadmethod" . "spreadMethod")
2769  ("startoffset" . "startOffset")
2770  ("stddeviation" . "stdDeviation")
2771  ("stitchtiles" . "stitchTiles")
2772  ("surfacescale" . "surfaceScale")
2773  ("systemlanguage" . "systemLanguage")
2774  ("tablevalues" . "tableValues")
2775  ("targetx" . "targetX")
2776  ("targety" . "targetY")
2777  ("textlength" . "textLength")
2778  ("viewbox" . "viewBox")
2779  ("viewtarget" . "viewTarget")
2780  ("xchannelselector" . "xChannelSelector")
2781  ("ychannelselector" . "yChannelSelector")
2782  ("zoomandpan" . "zoomAndPan"))))
2783 
2784 (defun adjust-foreign-attributes (token)
2785  (adjust-attributes token `(("xlink:actuate" . ("xlink" "actuate" ,(find-namespace "xlink")))
2786  ("xlink:arcrole" . ("xlink" "arcrole" ,(find-namespace "xlink")))
2787  ("xlink:href" . ("xlink" "href" ,(find-namespace "xlink")))
2788  ("xlink:role" . ("xlink" "role" ,(find-namespace "xlink")))
2789  ("xlink:show" . ("xlink" "show" ,(find-namespace "xlink")))
2790  ("xlink:title" . ("xlink" "title" ,(find-namespace "xlink")))
2791  ("xlink:type" . ("xlink" "type" ,(find-namespace "xlink")))
2792  ("xml:base" . ("xml" "base" ,(find-namespace "xml")))
2793  ("xml:lang" . ("xml" "lang" ,(find-namespace "xml")))
2794  ("xml:space" . ("xml" "space" ,(find-namespace "xml")))
2795  ("xmlns" . (nil "xmlns" ,(find-namespace "xmlns")))
2796  ("xmlns:xlink" . ("xmlns" "xlink" ,(find-namespace "xmlns"))))))
2797 
2798 (defun reset-insertion-mode ()
2799  (with-slots (inner-html html-namespace phase open-elements) *parser*
2800  (let ((last nil)
2801  (new-phase nil)
2802  (new-modes '(("select" . :in-select)
2803  ("td" . :in-cell)
2804  ("th" . :in-cell)
2805  ("tr" . :in-row)
2806  ("tbody" . :in-table-body)
2807  ("thead" . :in-table-body)
2808  ("tfoot" . :in-table-body)
2809  ("caption" . :in-caption)
2810  ("colgroup" . :in-column-group)
2811  ("table" . :in-table)
2812  ("head" . :in-body)
2813  ("body" . :in-body)
2814  ("frameset" . :in-frameset)
2815  ("html" . :before-head))))
2816  (loop for node in (reverse open-elements)
2817  for node-name = (node-name node)
2818  do
2819  (when (eql node (first open-elements))
2820  (assert inner-html)
2821  (setf last t)
2822  (setf node-name inner-html))
2823  ;; Check for conditions that should only happen in the innerHTML
2824  ;; case
2825  (when (member node-name '("select" "colgroup" "head" "html") :test #'string=)
2826  (assert inner-html))
2827 
2828  (unless (and (not last)
2829  (string/= (node-namespace node) html-namespace))
2830  (let ((match (cdr (assoc node-name new-modes :test #'string=))))
2831  (when match
2832  (setf new-phase match)
2833  (return))
2834  (when last
2835  (setf new-phase :in-body)
2836  (return)))))
2837  (setf phase new-phase))))
2838 
2839 (defun parse-rc-data-raw-text (token content-type)
2840  (assert (member content-type '(:rawtext :rcdata)))
2841  (with-slots (tokenizer original-phase phase) *parser*
2842  (insert-element token)
2843  (setf (tokenizer-state tokenizer) (ecase content-type
2844  (:rawtext :rawtext-state)
2845  (:rcdata :rcdata-state)))
2846  (setf original-phase phase)
2847  (setf phase :text)
2848  nil))
2849 
2850 
2851 ;; Phases --------------------------------------------------------------------
2852 
2853 (defun implied-tag-token (name &optional (type :end-tag))
2854  (list :type type :name name :data '() :self-closing nil))
2855 
2856 (defun implied-tag-token/full (name type
2857  &key (attributes '()) (self-closing nil))
2858  (list :type type :name name :data attributes :self-closing self-closing))
2859 
2860 (eval-when (:compile-toplevel :execute)
2861  (defun phase-process-method-name (function-name)
2862  (intern (concatenate 'string
2863  "%"
2864  (symbol-name function-name))
2865  (symbol-package function-name))))
2866 
2867 (defvar *phase-indent* 0)
2868 
2869 (defun call-phase-method (name phase token)
2870  ;(format *trace-output* "~&~vTcall: ~S ~S ~S" *phase-indent* name phase token)
2871  ;(break)
2872  (let ((result (let ((*phase-indent* (+ 4 *phase-indent*)))
2873  (funcall name phase token))))
2874  ;(format *trace-output* "~&~vTreturn: ~S ~S" *phase-indent* name result)
2875  result))
2876 
2877 (defmacro define-phase-process-functions (&body defs)
2878  `(progn
2879  ,@(loop for function-name in defs
2880  for method-name = (phase-process-method-name function-name)
2881  collect `(defgeneric ,method-name (phase token))
2882  collect `(defun ,function-name (token &key (phase *phase*))
2883  (call-phase-method #',method-name phase token)))))
2884 
2885 (define-phase-process-functions
2886  add-formatting-element
2887  end-tag-applet-marquee-object
2888  end-tag-block
2889  end-tag-body
2890  end-tag-br
2891  end-tag-caption
2892  end-tag-col
2893  end-tag-colgroup
2894  end-tag-form
2895  end-tag-formatting
2896  end-tag-frameset
2897  end-tag-head
2898  end-tag-heading
2899  end-tag-html
2900  end-tag-html-body-br
2901  end-tag-ignore
2902  end-tag-imply
2903  end-tag-imply-head
2904  end-tag-list-item
2905  end-tag-optgroup
2906  end-tag-option
2907  end-tag-other
2908  end-tag-p
2909  end-tag-script
2910  end-tag-select
2911  end-tag-table
2912  end-tag-table-cell
2913  end-tag-table-row-group
2914  end-tag-tr
2915  insert-text
2916  process-characters
2917  process-comment
2918  process-doctype
2919  process-end-tag
2920  process-eof
2921  process-space-characters
2922  process-start-tag
2923  start-tag-a
2924  start-tag-applet-marquee-object
2925  start-tag-base-link-command
2926  start-tag-body
2927  start-tag-button
2928  start-tag-caption
2929  start-tag-close-p
2930  start-tag-col
2931  start-tag-colgroup
2932  start-tag-form
2933  start-tag-formatting
2934  start-tag-frame
2935  start-tag-frameset
2936  start-tag-from-head
2937  start-tag-head
2938  start-tag-heading
2939  start-tag-hr
2940  start-tag-html
2941  start-tag-i-frame
2942  start-tag-image
2943  start-tag-imply-tbody
2944  start-tag-input
2945  start-tag-is-index
2946  start-tag-list-item
2947  start-tag-math
2948  start-tag-meta
2949  start-tag-misplaced
2950  start-tag-no-script-no-frames-style
2951  start-tag-nobr
2952  start-tag-noframes
2953  start-tag-opt
2954  start-tag-optgroup
2955  start-tag-option
2956  start-tag-other
2957  start-tag-param-source
2958  start-tag-plaintext
2959  start-tag-pre-listing
2960  start-tag-process-in-head
2961  start-tag-rawtext
2962  start-tag-row-group
2963  start-tag-rp-rt
2964  start-tag-script
2965  start-tag-select
2966  start-tag-style-script
2967  start-tag-svg
2968  start-tag-table
2969  start-tag-table-cell
2970  start-tag-table-element
2971  start-tag-table-other
2972  start-tag-textarea
2973  start-tag-title
2974  start-tag-tr
2975  start-tag-void-formatting
2976  start-tag-xmp)
2977 
2978 (defmacro def (phase name (&rest slots) &body body)
2979  `(defmethod ,(phase-process-method-name name) ((*phase* (eql ,phase)) token)
2980  (with-slots (,@slots) *parser*
2981  ,@body)))
2982 
2983 (defmacro tagname-dispatch (phase name &body cases)
2984  `(def ,phase ,name ()
2985  (let ((tagname (getf token :name)))
2986  (declare (ignorable tagname))
2987  ,(let* ((default '(error "Unhandled tag ~S" tagname))
2988  (string-cases
2989  (loop for (tagnames function) in cases
2990  append (cond ((stringp tagnames)
2991  `((,tagnames (,function token))))
2992  ((consp tagnames)
2993  (loop for tag in tagnames
2994  collect `(,tag (,function token))))
2995  ((eql 'default tagnames)
2996  (setf default `(,function token))
2997  nil)
2998  (t (error "Invalid tag name clause ~S" tagnames))))))
2999  (if (not string-cases)
3000  default
3001  `(string-case
3002  (tagname :default ,default)
3003  ,@string-cases))))))
3004 
3005 ;; Default methods
3006 
3007 (defmethod %process-comment (*phase* token)
3008  ;; For most phases the following is correct. Where it's not it will be
3009  ;; overridden.
3010  (insert-comment token (last-open-element))
3011  nil)
3012 
3013 (defmethod %process-doctype (*phase* token)
3014  (parser-parse-error :unexpected-doctype)
3015  nil)
3016 
3017 (defmethod %process-characters (*phase* token)
3018  (parser-insert-text (getf token :data))
3019  nil)
3020 
3021 (defmethod %process-space-characters (*phase* token)
3022  (parser-insert-text (getf token :data))
3023  nil)
3024 
3025 (defmethod %start-tag-html (*phase* token)
3026  (with-slots (first-start-tag open-elements)
3027  *parser*
3028  (when (and (not first-start-tag)
3029  (string= (getf token :name) "html"))
3030  (parser-parse-error :non-html-root))
3031  ;; XXX Need a check here to see if the first start tag token emitted is
3032  ;; this token... If it's not, invoke self.parser.parseError().
3033  (let ((root-element (first open-elements)))
3034  (loop for (name . value) in (getf token :data)
3035  do (unless (element-attribute root-element name)
3036  (setf (element-attribute root-element name) value))))
3037  (setf first-start-tag nil)
3038  nil))
3039 
3040 
3041 ;; InitialPhase
3042 
3043 (def :initial process-space-characters ()
3044  nil)
3045 
3046 (def :initial process-comment ()
3047  (insert-comment token (document*))
3048  nil)
3049 
3050 (def :initial process-doctype (compat-mode phase)
3051  (destructuring-bind (&key name public-id system-id correct &allow-other-keys)
3052  token
3053 
3054  (when (or (string/= name "html")
3055  public-id
3056  (and system-id (string/= system-id "about:legacy-compat")))
3057  (parser-parse-error :unknown-doctype))
3058 
3059  (unless public-id
3060  (setf public-id ""))
3061 
3062  (insert-doctype token)
3063 
3064  (setf public-id (ascii-upper-2-lower public-id))
3065 
3066  (cond ((or (not correct)
3067  (string/= name "html")
3068  (cl-ppcre:scan +quirks-mode-doctypes-regexp+ public-id)
3069  (member public-id '("-//w3o//dtd w3 html strict 3.0//en//"
3070  "-/w3c/dtd html 4.0 transitional/en"
3071  "html")
3072  :test #'string=)
3073  (and (not system-id)
3074  (cl-ppcre:scan '(:sequence :start-anchor (:alternation
3075  "-//w3c//dtd html 4.01 frameset//"
3076  "-//w3c//dtd html 4.01 transitional//"))
3077  public-id))
3078  (and system-id
3079  (equal (ascii-upper-2-lower system-id)
3080  "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd")))
3081  (setf compat-mode :quirks))
3082  ((or (cl-ppcre:scan '(:sequence :start-anchor (:alternation
3083  "-//w3c//dtd xhtml 1.0 frameset//"
3084  "-//w3c//dtd xhtml 1.0 transitional//"))
3085  public-id)
3086  (and system-id
3087  (cl-ppcre:scan '(:sequence :start-anchor (:alternation
3088  "-//w3c//dtd html 4.01 frameset//"
3089  "-//w3c//dtd html 4.01 transitional//"))
3090  public-id)))
3091  (setf compat-mode :limited-quirks)))
3092  (setf phase :before-html)
3093  nil))
3094 
3095 (flet ((anything-else ()
3096  (with-slots (compat-mode phase)
3097  *parser*
3098  (setf compat-mode :quirks)
3099  (setf phase :before-html))))
3100 
3101  (def :initial process-characters ()
3102  (parser-parse-error :expected-doctype-but-got-chars)
3103  (anything-else)
3104  token)
3105 
3106  (def :initial process-start-tag ()
3107  (parser-parse-error :expected-doctype-but-got-start-tag
3108  (list :name (getf token :name)))
3109  (anything-else)
3110  token)
3111 
3112  (def :initial process-end-tag ()
3113  (parser-parse-error :expected-doctype-but-got-end-tag
3114  (list :name (getf token :name)))
3115  (anything-else)
3116  token)
3117 
3118  (def :initial process-eof ()
3119  (parser-parse-error :expected-doctype-but-got-eof)
3120  (anything-else)
3121  t))
3122 
3123 
3124 ;; BeforeHtmlPhase
3125 
3126 (flet ((insert-html-element ()
3127  (insert-root (implied-tag-token "html" :start-tag))
3128  (setf (parser-phase *parser*) :before-head)))
3129 
3130 
3131  (def :before-html process-eof ()
3132  (insert-html-element)
3133  t)
3134 
3135  (def :before-html process-comment ()
3136  (insert-comment token (document*))
3137  nil)
3138 
3139  (def :before-html process-space-characters ()
3140  nil)
3141 
3142  (def :before-html process-characters ()
3143  (insert-html-element)
3144  token)
3145 
3146  (def :before-html process-start-tag (first-start-tag)
3147  (when (string= (getf token :name) "html")
3148  (setf first-start-tag t))
3149  (insert-html-element)
3150  token)
3151 
3152  (def :before-html process-end-tag ()
3153  (cond ((not (member (getf token :name) '("head" "body" "html" "br") :test #'string=))
3154  (parser-parse-error :unexpected-end-tag-before-html `(:name ,(getf token :name)))
3155  nil)
3156  (t
3157  (insert-html-element)
3158  token))))
3159 
3160 ;; BeforeHeadPhase
3161 
3162 (tagname-dispatch :before-head process-start-tag
3163  ("html" start-tag-html)
3164  ("head" start-tag-head token)
3165  (default start-tag-other))
3166 
3167 (tagname-dispatch :before-head process-end-tag
3168  (("head" "body" "html" "br") end-tag-imply-head)
3169  (default end-tag-other))
3170 
3171 (def :before-head process-eof ()
3172  (start-tag-head (implied-tag-token "head" :start-tag))
3173  t)
3174 
3175 (def :before-head process-space-characters ()
3176  nil)
3177 
3178 (def :before-head process-characters ()
3179  (start-tag-head (implied-tag-token "head" :start-tag))
3180  token)
3181 
3182 (def :before-head start-tag-html ()
3183  (process-start-tag token :phase :in-body))
3184 
3185 (def :before-head start-tag-head (head-pointer)
3186  (insert-element token)
3187  (setf head-pointer (last-open-element))
3188  (setf (parser-phase *parser*) :in-head)
3189  nil)
3190 
3191 (def :before-head start-tag-other ()
3192  (start-tag-head (implied-tag-token "head" :start-tag))
3193  token)
3194 
3195 (def :before-head end-tag-imply-head ()
3196  (start-tag-head (implied-tag-token "head" :start-tag))
3197  token)
3198 
3199 (def :before-head end-tag-other ()
3200  (parser-parse-error :end-tag-after-implied-root `(:name ,(getf token :name)))
3201  nil)
3202 
3203 ;; InHeadPhase
3204 
3205 (tagname-dispatch :in-head process-start-tag
3206  ("html" start-tag-html)
3207  ("title" start-tag-title)
3208  (("noscript" "noframes" "style") start-tag-no-script-no-frames-style)
3209  ("script" start-tag-script)
3210  (("base" "basefont" "bgsound" "command" "link") start-tag-base-link-command)
3211  ("meta" start-tag-meta)
3212  ("head" start-tag-head)
3213  (default start-tag-other))
3214 
3215 (tagname-dispatch :in-head process-end-tag
3216  ("head" end-tag-head)
3217  (("br" "html" "body") end-tag-html-body-br)
3218  (default end-tag-other))
3219 
3220 (flet ((anything-else ()
3221  (end-tag-head (implied-tag-token "head"))))
3222 
3223  ;; the real thing
3224  (def :in-head process-eof ()
3225  (anything-else)
3226  t)
3227 
3228  (def :in-head process-characters ()
3229  (anything-else)
3230  token)
3231 
3232  (def :in-head start-tag-html ()
3233  (process-start-tag token :phase :in-body))
3234 
3235  (def :in-head start-tag-head ()
3236  (parser-parse-error :two-heads-are-not-better-than-one)
3237  nil)
3238 
3239  (def :in-head start-tag-base-link-command (open-elements)
3240  (insert-element token)
3241  (pop-end open-elements)
3242  (setf (getf token :self-closing-acknowledged) t)
3243  nil)
3244 
3245  (defun parse-content-attr (string)
3246  "The algorithm for extracting an encoding from a meta element"
3247  (let ((position 0)) ; Step 1
3248  (labels ((char-at (index)
3249  (and (< position (length string))
3250  (char string index)))
3251  (skip-space ()
3252  (loop while (member (char-at position) +space-characters+)
3253  do (incf position))))
3254  ;; Step 2
3255  (loop
3256  (setf position (search "charset" string :start2 position))
3257  (unless position
3258  (return-from parse-content-attr))
3259  ;; Set position to after charset
3260  (incf position 7)
3261  ;; Step 3
3262  (skip-space)
3263  ;; Step 4
3264  (when (eql (char-at position) #\=)
3265  (return))
3266  (decf position))
3267  ;; Step 5
3268  (incf position)
3269  (skip-space)
3270  ;; Step 6
3271  (let ((next-char (char-at position)))
3272  (cond ((or (eql #\' next-char)
3273  (eql #\" next-char))
3274  (incf position)
3275  (let ((end (position next-char string :start position)))
3276  (when end
3277  (subseq string position end))))
3278  (next-char
3279  (let ((start position))
3280  (loop until (or (= position (length string))
3281  (member (char-at position) +space-characters+))
3282  do (incf position))
3283  (subseq string start position))))))))
3284 
3285 
3286  (def :in-head start-tag-meta (tokenizer open-elements)
3287  (insert-element token)
3288  (pop-end open-elements)
3289  (setf (getf token :self-closing-acknowledged) t)
3290 
3291  (let ((attributes (getf token :data)))
3292  (when (eql (cdr (html5-stream-encoding (tokenizer-stream tokenizer))) :tentative)
3293  (cond ((assoc "charset" attributes :test #'string=)
3294  (html5-stream-change-encoding (tokenizer-stream tokenizer)
3295  (cdr (assoc "charset" attributes :test #'string=))))
3296  ((and (assoc "http-equiv" attributes :test #'string=)
3297  (ascii-istring= (cdr (assoc "http-equiv" attributes :test #'string=))
3298  "Content-Type")
3299  (assoc "content" attributes :test #'string=))
3300  (let* ((content (cdr (assoc "content" attributes :test #'string=)))
3301  (new-encoding (parse-content-attr content)))
3302  (if new-encoding
3303  (html5-stream-change-encoding (tokenizer-stream tokenizer)
3304  new-encoding)
3305  (parser-parse-error :invalid-encoding-declaration
3306  `(:content ,content))))))))
3307  nil)
3308 
3309  (def :in-head start-tag-title ()
3310  (parse-rc-data-raw-text token :rcdata)
3311  nil)
3312 
3313  (def :in-head start-tag-no-script-no-frames-style ()
3314  ;; Need to decide whether to implement the scripting-disabled case
3315  (parse-rc-data-raw-text token :rawtext))
3316 
3317  (def :in-head start-tag-script (tokenizer original-phase phase)
3318  (insert-element token)
3319  (setf (tokenizer-state tokenizer) :script-data-state)
3320  (setf original-phase phase)
3321  (setf phase :text)
3322  nil)
3323 
3324  (def :in-head start-tag-other ()
3325  (anything-else)
3326  token)
3327 
3328  (def :in-head end-tag-head (phase open-elements)
3329  (let ((node (pop-end open-elements)))
3330  (assert (string= (node-name node) "head") () "Expected head got ~S" (node-name node))
3331  (setf phase :after-head)
3332  nil))
3333 
3334  (def :in-head end-tag-html-body-br ()
3335  (anything-else)
3336  token)
3337 
3338  (def :in-head end-tag-other ()
3339  (parser-parse-error :unexpected-end-tag `(:name ,(getf token :name)))
3340  nil))
3341 
3342 ;; XXX If we implement a parser for which scripting is disabled we need to
3343 ;; implement this phase.
3344 ;;
3345 ;; InHeadNoScriptPhase
3346 
3347 ;; AfterHeadPhase
3348 
3349 (tagname-dispatch :after-head process-start-tag
3350  ("html" start-tag-html)
3351  ("body" start-tag-body)
3352  ("frameset" start-tag-frameset)
3353  (("base" "basefont" "bgsound" "link" "meta"
3354  "noframes" "script" "style" "title")
3355  start-tag-from-head)
3356  ("head" start-tag-head)
3357  (default start-tag-other))
3358 
3359 (tagname-dispatch :after-head process-end-tag
3360  (("body" "html" "br") end-tag-html-body-br)
3361  (default end-tag-other))
3362 
3363 (flet ((anything-else ()
3364  (with-slots (phase frameset-ok) *parser*
3365  (insert-element (implied-tag-token "body" :start-tag))
3366  (setf phase :in-body)
3367  (setf frameset-ok t))))
3368 
3369  (def :after-head process-eof ()
3370  (anything-else)
3371  t)
3372 
3373  (def :after-head process-characters ()
3374  (anything-else)
3375  token)
3376 
3377  (def :after-head start-tag-html ()
3378  (process-start-tag token :phase :in-body))
3379 
3380  (def :after-head start-tag-body (phase frameset-ok)
3381  (setf frameset-ok nil)
3382  (insert-element token)
3383  (setf phase :in-body)
3384  nil)
3385 
3386  (def :after-head start-tag-frameset (phase)
3387  (insert-element token)
3388  (setf phase :in-frameset)
3389  nil)
3390 
3391  (def :after-head start-tag-from-head (head-pointer open-elements)
3392  (parser-parse-error :unexpected-start-tag-out-of-my-head
3393  `(:name ,(getf token :name)))
3394  (push-end head-pointer open-elements)
3395  (process-start-tag token :phase :in-head)
3396  (loop for node in (reverse open-elements)
3397  do (when (string= "head" (node-name node))
3398  (setf open-elements
3399  (remove node open-elements :test #'equal))
3400  (return)))
3401  nil)
3402 
3403  (def :after-head start-tag-head ()
3404  (parser-parse-error :unexpected-start-tag
3405  `(:name ,(getf token :name)))
3406  nil)
3407 
3408  (def :after-head start-tag-other ()
3409  (anything-else)
3410  token)
3411 
3412  (def :after-head end-tag-html-body-br ()
3413  (anything-else)
3414  token)
3415 
3416  (def :after-head end-tag-other ()
3417  (parser-parse-error :unexpected-end-tag
3418  `(:name ,(getf token :name)))
3419  nil))
3420 
3421 ;; InBodyPhase
3422 
3423 (tagname-dispatch :in-body process-start-tag
3424  ("html" start-tag-html)
3425  (("base" "basefont" "bgsound" "command" "link"
3426  "meta" "noframes" "script" "style" "title")
3427  start-tag-process-in-head)
3428  ("body" start-tag-body)
3429  ("frameset" start-tag-frameset)
3430  (("address" "article" "aside" "blockquote" "center" "details"
3431  "dir" "div" "dl" "fieldset" "figcaption" "figure"
3432  "footer" "header" "hgroup" "menu" "nav" "ol" "p"
3433  "section" "summary" "ul")
3434  start-tag-close-p)
3435  (#.+heading-elements+ start-tag-heading)
3436  (("pre" "listing") start-tag-pre-listing)
3437  ("form" start-tag-form)
3438  (("li" "dd" "dt") start-tag-list-item)
3439  ("plaintext" start-tag-plaintext)
3440  ("a" start-tag-a)
3441  (("b" "big" "code" "em" "font" "i" "s" "small" "strike"
3442  "strong" "tt" "u")
3443  start-tag-formatting)
3444  ("nobr" start-tag-nobr)
3445  ("button" start-tag-button)
3446  (("applet" "marquee" "object") start-tag-applet-marquee-object)
3447  ("xmp" start-tag-xmp)
3448  ("table" start-tag-table)
3449  (("area" "br" "embed" "img" "keygen" "wbr")
3450  start-tag-void-formatting)
3451  (("param" "source" "track") start-tag-param-source)
3452  ("input" start-tag-input)
3453  ("hr" start-tag-hr)
3454  ("image" start-tag-image)
3455  ("isindex" start-tag-is-index)
3456  ("textarea" start-tag-textarea)
3457  ("iframe" start-tag-i-frame)
3458  (("noembed" "noscript") start-tag-rawtext)
3459  ("select" start-tag-select)
3460  (("rp" "rt") start-tag-rp-rt)
3461  (("option" "optgroup") start-tag-opt)
3462  (("math") start-tag-math)
3463  (("svg") start-tag-svg)
3464  (("caption" "col" "colgroup" "frame" "head"
3465  "tbody" "td" "tfoot" "th" "thead"
3466  "tr")
3467  start-tag-misplaced)
3468  (default start-tag-other))
3469 
3470 (tagname-dispatch :in-body process-end-tag
3471  ("body" end-tag-body)
3472  ("html" end-tag-html)
3473  (("address" "article" "aside" "blockquote" "button" "center"
3474  "details" "dir" "div" "dl" "fieldset" "figcaption" "figure"
3475  "footer" "header" "hgroup" "listing" "menu" "nav" "ol" "pre"
3476  "section" "summary" "ul")
3477  end-tag-block)
3478  ("form" end-tag-form)
3479  ("p" end-tag-p)
3480  (("dd" "dt" "li") end-tag-list-item)
3481  (#.+heading-elements+ end-tag-heading)
3482  (("a" "b" "big" "code" "em" "font" "i" "nobr" "s" "small"
3483  "strike" "strong" "tt" "u")
3484  end-tag-formatting)
3485  (("applet" "marquee" "object") end-tag-applet-marquee-object)
3486  ("br" end-tag-br)
3487  (default end-tag-other))
3488 
3489 (flet ((is-matching-formatting-element (node1 node2)
3490  (and (equal (node-name node1) (node-name node2))
3491  (equal (node-namespace node1) (node-namespace node2))
3492  (node-attributes= node1 node2))))
3493 
3494  (def :in-body add-formatting-element (reverse active-formatting-elements)
3495  (insert-element token)
3496  (let ((element (last-open-element))
3497  matching-elements)
3498  (loop for node in (reverse active-formatting-elements)
3499  do (if (eq node :marker)
3500  (return)
3501  (when (is-matching-formatting-element node element)
3502  (push-end node matching-elements))))
3503  (assert (<= (length matching-elements) 3))
3504  (when (= (length matching-elements) 3)
3505  (setf active-formatting-elements
3506  (remove (car (last matching-elements))
3507  active-formatting-elements)))
3508  (assert element)
3509  (push-end element active-formatting-elements))
3510  nil))
3511 
3512 (def :in-body process-eof (open-elements)
3513  (let ((allowed-elements '("dd" "dt" "li" "p" "tbody" "td"
3514  "tfoot" "th" "thead" "tr" "body" "html")))
3515  (loop for node in (reverse open-elements)
3516  do (when (not (member (node-name node)
3517  allowed-elements
3518  :test #'string=))
3519  (parser-parse-error :expected-closing-tag-but-got-eof)
3520  (return))))
3521  nil)
3522 
3523 (def :in-body process-characters (frameset-ok)
3524  (let ((data (getf token :data)))
3525  (if (equal data (string #\u0000))
3526  nil
3527  (progn
3528  (reconstruct-active-formatting-elements)
3529  (parser-insert-text data)
3530  ;;This must be bad for performance
3531  (when (and frameset-ok
3532  (notevery (lambda (char)
3533  (find char +space-characters+))
3534  data))
3535  (setf frameset-ok nil))
3536  nil))))
3537 
3538 (def :in-body process-space-characters (in-body-process-space-characters-mode)
3539  (ecase in-body-process-space-characters-mode
3540  (:non-pre
3541  (reconstruct-active-formatting-elements)
3542  (parser-insert-text (getf token :data)))
3543  (:drop-newline
3544  (let ((data (getf token :data)))
3545  (setf in-body-process-space-characters-mode :non-pre)
3546  (when (and (plusp (length data))
3547  (char= #\Newline (char data 0))
3548  (member (node-name (last-open-element))
3549  '("pre" "listing" "textarea")
3550  :test #'string=)
3551  (not (node-has-content (last-open-element))))
3552  (setf data (subseq data 1)))
3553  (when (plusp (length data))
3554  (reconstruct-active-formatting-elements)
3555  (parser-insert-text data)))))
3556  nil)
3557 
3558 (def :in-body start-tag-process-in-head ()
3559  (process-start-tag token :phase :in-head))
3560 
3561 (def :in-body start-tag-body (frameset-ok open-elements)
3562  (parser-parse-error :unexpected-start-tag
3563  `(:name ,(getf token :name)))
3564  (if (or (= 1 (length open-elements))
3565  (string/= (node-name (second open-elements)) "body"))
3566  (assert (slot-value *parser* 'inner-html))
3567  (progn
3568  (setf frameset-ok nil)
3569  (loop for (name . value) in (getf token :data)
3570  do (unless (element-attribute (second open-elements) name)
3571  (setf (element-attribute (second open-elements) name) value)))))
3572  nil)
3573 
3574 (def :in-body start-tag-frameset (frameset-ok phase open-elements)
3575  (parser-parse-error :unexpected-start-tag
3576  `(:name ,(getf token :name)))
3577  (cond ((or (= 1 (length open-elements))
3578  (string/= (node-name (second open-elements)) "body"))
3579  (assert (slot-value *parser* 'inner-html)))
3580  ((not frameset-ok)
3581  nil)
3582  (t
3583  (when (node-parent (second open-elements))
3584  (node-remove-child (node-parent (second open-elements))
3585  (second open-elements)))
3586  (loop until (string= (node-name (last-open-element))
3587  "html")
3588  do (pop-end open-elements))
3589  (insert-element token)
3590  (setf phase :in-frameset)))
3591  nil)
3592 
3593 (def :in-body start-tag-close-p ()
3594  (when (element-in-scope "p" "button")
3595  (end-tag-p (implied-tag-token "p")))
3596  (insert-element token)
3597  nil)
3598 
3599 (def :in-body start-tag-pre-listing (in-body-process-space-characters-mode frameset-ok)
3600  (when (element-in-scope "p" "button")
3601  (end-tag-p (implied-tag-token "p")))
3602  (insert-element token)
3603  (setf frameset-ok nil)
3604  (setf in-body-process-space-characters-mode :drop-newline)
3605  nil)
3606 
3607 (def :in-body start-tag-form (form-pointer)
3608  (if form-pointer
3609  (parser-parse-error :unexpected-start-tag
3610  `(:name ,(getf token :name)))
3611  (progn
3612  (when (element-in-scope "p" "button")
3613  (end-tag-p (implied-tag-token "p")))
3614  (insert-element token)
3615  (setf form-pointer (last-open-element))))
3616  nil)
3617 
3618 (def :in-body start-tag-list-item (phase frameset-ok open-elements)
3619  (setf frameset-ok nil)
3620  (let ((stop-names (cond ((string= (getf token :name) "li")
3621  '("li"))
3622  ((string= (getf token :name) "dt")
3623  '("dt" "dd"))
3624  ((string= (getf token :name) "dd")
3625  '("dt" "dd")))))
3626  (loop for node in (reverse open-elements)
3627  do (cond ((member (node-name node) stop-names :test #'string=)
3628  (process-end-tag (implied-tag-token (node-name node)) :phase phase)
3629  (return))
3630  ((and (member (node-name-tuple node) +special-elements+
3631  :test #'equal)
3632  (not (member (node-name node)
3633  '("address" "div" "p")
3634  :test #'string=)))
3635  (return)))))
3636  (when (element-in-scope "p" "button")
3637  (process-end-tag (implied-tag-token "p") :phase phase))
3638  (insert-element token)
3639  nil)
3640 
3641 (def :in-body start-tag-plaintext (tokenizer)
3642  (when (element-in-scope "p" "button")
3643  (end-tag-p (implied-tag-token "p")))
3644  (insert-element token)
3645  (setf (tokenizer-state tokenizer) :plaintext-state)
3646  nil)
3647 
3648 (def :in-body start-tag-heading (open-elements)
3649  (when (element-in-scope "p" "button")
3650  (end-tag-p (implied-tag-token "p")))
3651  (when (member (node-name (last-open-element)) +heading-elements+
3652  :test #'string=)
3653  (perror :unexpected-start-tag :name (getf token :name))
3654  (pop-end open-elements))
3655  (insert-element token)
3656  nil)
3657 
3658 (def :in-body start-tag-a (open-elements active-formatting-elements)
3659  (let ((afe-a-element (element-in-active-formatting-elements "a")))
3660  (when afe-a-element
3661  (perror :unexpected-start-tag-implies-end-tag
3662  :start-name "a" :end-name "a")
3663  (end-tag-formatting (implied-tag-token "a"))
3664  (when (member afe-a-element open-elements)
3665  (setf open-elements
3666  (remove afe-a-element open-elements)))
3667  (when (member afe-a-element active-formatting-elements)
3668  (setf active-formatting-elements
3669  (remove afe-a-element active-formatting-elements))))
3670  (reconstruct-active-formatting-elements)
3671  (add-formatting-element token))
3672  nil)
3673 
3674 (def :in-body start-tag-formatting ()
3675  (reconstruct-active-formatting-elements)
3676  (add-formatting-element token)
3677  nil)
3678 
3679 (def :in-body start-tag-nobr ()
3680  (reconstruct-active-formatting-elements)
3681  (when (element-in-scope "nobr")
3682  (perror :unexpected-start-tag-implies-end-tag
3683  :start-name "nobr" :end-name "nobr")
3684  (process-end-tag (implied-tag-token "nobr"))
3685  ;; XXX Need tests that trigger the following
3686  (reconstruct-active-formatting-elements))
3687  (add-formatting-element token)
3688  nil)
3689 
3690 (def :in-body start-tag-button (frameset-ok)
3691  (cond ((element-in-scope "button")
3692  (perror :unexpected-start-tag-implies-end-tag
3693  :start-name "button" :end-name "button")
3694  (process-end-tag (implied-tag-token "button"))
3695  token)
3696  (t
3697  (reconstruct-active-formatting-elements)
3698  (insert-element token)
3699  (setf frameset-ok nil)
3700  nil)))
3701 
3702 (def :in-body start-tag-applet-marquee-object (frameset-ok active-formatting-elements)
3703  (reconstruct-active-formatting-elements)
3704  (insert-element token)
3705  (push-end :marker active-formatting-elements)
3706  (setf frameset-ok nil)
3707  nil)
3708 
3709 (def :in-body start-tag-xmp (frameset-ok)
3710  (when (element-in-scope "p" "button")
3711  (end-tag-p (implied-tag-token "p")))
3712  (reconstruct-active-formatting-elements)
3713  (setf frameset-ok nil)
3714  (parse-rc-data-raw-text token :rawtext)
3715  nil)
3716 
3717 (def :in-body start-tag-table (frameset-ok compat-mode phase)
3718  (when (not (eq compat-mode :quirks))
3719  (when (element-in-scope "p" "button")
3720  (end-tag-p (implied-tag-token "p"))))
3721  (insert-element token)
3722  (setf frameset-ok nil)
3723  (setf phase :in-table)
3724  nil)
3725 
3726 (def :in-body start-tag-void-formatting (frameset-ok open-elements)
3727  (reconstruct-active-formatting-elements)
3728  (insert-element token)
3729  (pop-end open-elements)
3730  (setf (getf token :self-closing-acknowledged) t)
3731  (setf frameset-ok nil)
3732  nil)
3733 
3734 (def :in-body start-tag-input (frameset-ok)
3735  (let ((old-frameset-ok frameset-ok))
3736  (start-tag-void-formatting token)
3737  (let ((type (assoc "type" (getf token :data) :test #'string=)))
3738  (when (and type
3739  (string= (ascii-upper-2-lower (cdr type)) "hidden"))
3740  ;;input type=hidden doesn't change framesetOK
3741  (setf frameset-ok old-frameset-ok))))
3742  nil)
3743 
3744 (def :in-body start-tag-param-source (open-elements)
3745  (insert-element token)
3746  (pop-end open-elements)
3747  (setf (getf token :self-closing-acknowledged) t)
3748  nil)
3749 
3750 (def :in-body start-tag-hr (frameset-ok open-elements)
3751  (when (element-in-scope "p" "button")
3752  (end-tag-p (implied-tag-token "p")))
3753  (insert-element token)
3754  (pop-end open-elements)
3755  (setf (getf token :self-closing-acknowledged) t)
3756  (setf frameset-ok nil)
3757  nil)
3758 
3759 (def :in-body start-tag-image ()
3760  (perror :unexpected-start-tag-treated-as
3761  :original-name "image" :new-name "img")
3762  (process-start-tag (implied-tag-token/full
3763  "img" :start-tag
3764  :attributes (getf token :data)
3765  :self-closing (getf token :self-closing)))
3766  nil)
3767 
3768 (def :in-body start-tag-is-index (form-pointer)
3769  (block nil
3770  (perror :deprecated-tag :name "isindex")
3771  (when form-pointer
3772  (return nil))
3773  (let (attrs)
3774  (when (assoc "action" (getf token :data) :test #'string=)
3775  (setf attrs (list (assoc "action" (getf token :data) :test #'string=))))
3776  (process-start-tag (implied-tag-token/full "form" :start-tag
3777  :attributes attrs)))
3778  (process-start-tag (implied-tag-token "hr" :start-tag))
3779  (process-start-tag (implied-tag-token "label" :start-tag))
3780  ;; XXX Localization ...
3781  (let ((prompt (if (assoc "prompt" (getf token :data) :test #'string=)
3782  (cdr (assoc "prompt" (getf token :data) :test #'string=))
3783  "This is a searchable index. Enter search keywords: ")))
3784  (process-characters (list :type :characters :data prompt)))
3785  (let ((attrs (append (remove-if (lambda (el)
3786  (member (car el) '("action" "prompt" "name")
3787  :test #'string=))
3788  (copy-list (getf token :data)))
3789  (copy-list '(("name" . "isindex"))))))
3790  (process-start-tag (implied-tag-token/full "input" :start-tag
3791  :attributes attrs
3792  :self-closing
3793  (getf token :self-closing))))
3794  (process-end-tag (implied-tag-token "label"))
3795  (process-start-tag (implied-tag-token "hr" :start-tag))
3796  (process-end-tag (implied-tag-token "form")))
3797  nil)
3798 
3799 (def :in-body start-tag-textarea (tokenizer
3800  in-body-process-space-characters-mode
3801  frameset-ok)
3802  (insert-element token)
3803  (setf (tokenizer-state tokenizer) :rcdata-state)
3804  (setf in-body-process-space-characters-mode :drop-newline)
3805  (setf frameset-ok nil)
3806  nil)
3807 
3808 (def :in-body start-tag-i-frame (frameset-ok)
3809  (setf frameset-ok nil)
3810  (start-tag-rawtext token)
3811  nil)
3812 
3813 (def :in-body start-tag-rawtext ()
3814  ;;;iframe, noembed noframes, noscript(if scripting enabled)
3815  (parse-rc-data-raw-text token :rawtext)
3816  nil)
3817 
3818 (def :in-body start-tag-opt (phase)
3819  (when (string= (node-name (last-open-element)) "option")
3820  (process-end-tag (implied-tag-token "option") :phase phase))
3821  (reconstruct-active-formatting-elements)
3822  (insert-element token)
3823  nil)
3824 
3825 (def :in-body start-tag-select (frameset-ok)
3826  (reconstruct-active-formatting-elements)
3827  (insert-element token)
3828  (setf frameset-ok nil)
3829  (if (member (parser-phase *parser*) '(:in-table :in-caption :in-column-group
3830  :in-table-body :in-row :in-cell))
3831  (setf (parser-phase *parser*) :in-select-in-table)
3832  (setf (parser-phase *parser*) :in-select))
3833  nil)
3834 
3835 (def :in-body start-tag-rp-rt ()
3836  (when (element-in-scope "ruby")
3837  (generate-implied-end-tags)
3838  (when (string/= (node-name (last-open-element)) "ruby")
3839  (perror :expected-ruby-tag)))
3840  (insert-element token)
3841  nil)
3842 
3843 (def :in-body start-tag-math (open-elements)
3844  (reconstruct-active-formatting-elements)
3845  (adjust-math-ml-attributes token)
3846  (adjust-foreign-attributes token)
3847  (setf (getf token :namespace) (find-namespace "mathml"))
3848  (insert-element token)
3849  ;;Need to get the parse error right for the case where the token
3850  ;;has a namespace not equal to the xmlns attribute
3851  (when (getf token :self-closing)
3852  (pop-end open-elements)
3853  (setf (getf token :self-closing-acknowledged) t))
3854  nil)
3855 
3856 (def :in-body start-tag-svg (open-elements)
3857  (reconstruct-active-formatting-elements)
3858  (adjust-svg-attributes token)
3859  (adjust-foreign-attributes token)
3860  (setf (getf token :namespace) (find-namespace "svg"))
3861  (insert-element token)
3862  ;;Need to get the parse error right for the case where the token
3863  ;;has a namespace not equal to the xmlns attribute
3864  (when (getf token :self-closing)
3865  (pop-end open-elements)
3866  (setf (getf token :self-closing-acknowledged) t))
3867  nil)
3868 
3869 (def :in-body start-tag-misplaced ()
3870  ;;; Elements that should be children of other elements that have a
3871  ;;; different insertion mode; here they are ignored
3872  ;;; "caption", "col", "colgroup", "frame", "frameset", "head",
3873  ;;; "option", "optgroup", "tbody", "td", "tfoot", "th", "thead",
3874  ;;; "tr", "noscript"
3875  (perror :unexpected-start-tag-ignored :name (getf token :name))
3876  nil)
3877 
3878 (def :in-body start-tag-other ()
3879  (reconstruct-active-formatting-elements)
3880  (insert-element token)
3881  nil)
3882 
3883 (def :in-body end-tag-p (open-elements)
3884  (cond ((not (element-in-scope "p" "button"))
3885  (start-tag-close-p (implied-tag-token "p" :start-tag))
3886  (perror :unexpected-end-tag :name "p")
3887  (end-tag-p (implied-tag-token "p")))
3888  (t
3889  (generate-implied-end-tags "p")
3890  (when (string/= (node-name (last-open-element)) "p")
3891  (perror :unexpected-end-tag :name "p"))
3892  (let ((node (pop-end open-elements)))
3893  (loop until (string= (node-name node) "p")
3894  do (setf node (pop-end open-elements))))))
3895  nil)
3896 
3897 (def :in-body end-tag-body (open-elements)
3898  (block nil
3899  (when (not (element-in-scope "body"))
3900  (perror :unexpected-scope)
3901  (return nil))
3902  (when (string/= (node-name (last-open-element)) "body")
3903  (loop for node in (cddr open-elements)
3904  do (when (member (node-name node)
3905  '("dd" "dt" "li" "optgroup" "option" "p" "rp"
3906  "rt" "tbody" "td" "tfoot" "th" "thead" "tr"
3907  "body" "html")
3908  :test #'string=)
3909  ;;Not sure this is the correct name for the parse error
3910  (perror :expected-one-end-tag-but-got-another
3911  :expected-name "body" :got-name (node-name node))
3912  (return)))))
3913  (setf (parser-phase *parser*) :after-body)
3914  nil)
3915 
3916 (def :in-body end-tag-html ()
3917  ;;We repeat the test for the body end tag token being ignored here
3918  (cond ((element-in-scope "body")
3919  (end-tag-body (implied-tag-token "body"))
3920  token)
3921  (t nil)))
3922 
3923 (def :in-body end-tag-block (in-body-process-space-characters-mode open-elements)
3924  ;;Put us back in the right whitespace handling mode
3925  (when (string= (getf token :name) "pre")
3926  (setf in-body-process-space-characters-mode :non-pre))
3927  (let ((in-scope (element-in-scope (getf token :name))))
3928  (when in-scope
3929  (generate-implied-end-tags))
3930  (when (string/= (node-name (last-open-element))
3931  (getf token :name))
3932  (perror :end-tag-too-early :name (getf token :name)))
3933  (when in-scope
3934  (let ((node (pop-end open-elements)))
3935  (loop until (string= (node-name node) (getf token :name))
3936  do (setf node (pop-end open-elements))))))
3937  nil)
3938 
3939 (def :in-body end-tag-form (form-pointer open-elements)
3940  (let ((node form-pointer))
3941  (setf form-pointer nil)
3942  (if (or (null node) (not (element-in-scope (node-name node))))
3943  (perror :unexpected-end-tag :name "form")
3944  (progn
3945  (generate-implied-end-tags)
3946  (when (not (equal (last-open-element) node))
3947  (perror :end-tag-too-early-ignored :name "form"))
3948  (setf open-elements
3949  (remove node open-elements)))))
3950  nil)
3951 
3952 ;;; Note to self:
3953 ;;; - A token is a plist.
3954 ;;; - A property is an alist.
3955 ;;; - A node is an object.
3956 ;;; - An element is a node.
3957 
3958 (def :in-body end-tag-list-item (open-elements)
3959  (let ((variant (if (string= (getf token :name) "li")
3960  "list"
3961  nil)))
3962  (if (not (element-in-scope (getf token :name) variant))
3963  (perror :unexpected-end-tag :name (getf token :name))
3964  (progn
3965  (generate-implied-end-tags (getf token :name))
3966  (when (string/= (node-name (last-open-element))
3967  (getf token :name))
3968  (perror :end-tag-too-early :name (getf token :name)))
3969  (let ((node (pop-end open-elements)))
3970  (loop until (string= (node-name node) (getf token :name))
3971  do (setf node (pop-end open-elements)))))))
3972  nil)
3973 
3974 (def :in-body end-tag-heading (open-elements)
3975  (loop for item in +heading-elements+
3976  do (when (element-in-scope item)
3977  (generate-implied-end-tags)
3978  (return)))
3979  (when (string/= (node-name (last-open-element))
3980  (getf token :name))
3981  (perror :end-tag-too-early :name (getf token :name)))
3982  (loop for item in +heading-elements+
3983  do (when (element-in-scope item)
3984  (let ((item (pop-end open-elements)))
3985  (loop until (member (node-name item) +heading-elements+
3986  :test #'string=)
3987  do (setf item (pop-end open-elements))))))
3988  nil)
3989 
3990 (defmacro insert-elt-at (object index place)
3991  (let ((tmp (gensym "TMP"))
3992  (object-symbol (gensym "OBJECT"))
3993  (index-symbol (gensym "INDEX")))
3994  `(let ((,object-symbol ,object)
3995  (,index-symbol ,index)
3996  (,tmp ,place))
3997  (setf ,place (append (subseq ,tmp 0 (min ,index-symbol (length ,tmp)))
3998  (list ,object-symbol)
3999  (nthcdr ,index-symbol ,tmp))))))
4000 
4001 (def :in-body end-tag-formatting (active-formatting-elements open-elements)
4002  ;; The much-feared adoption agency algorithm
4003  ;; http://www.whatwg.org/specs/web-apps/current-work/#adoptionAgency
4004  ;; XXX Better parseError messages appreciated.
4005  (loop named outer
4006  with name = (getf token :name)
4007  with outer-loop-counter = 0
4008  with formatting-element
4009  with afe-index
4010  with furthest-block
4011  with bookmark
4012  with last-node
4013  with inner-loop-counter
4014  with index
4015  with node
4016  with common-ancestor
4017  with clone
4018  while (< outer-loop-counter 8)
4019  do
4020  (incf outer-loop-counter)
4021 
4022  ;; Step 1 paragraph 1
4023  (setf formatting-element
4024  (element-in-active-formatting-elements name))
4025  (cond ((or (not formatting-element)
4026  (and (member formatting-element
4027  open-elements)
4028  (not (element-in-scope
4029  (node-name formatting-element)))))
4030  (perror :adoption-agency-1.1 :name name)
4031  (return-from outer nil))
4032 
4033  ;; Step 1 paragraph 2
4034  ((not (member formatting-element
4035  open-elements))
4036  (perror :adoption-agency-1.2 :name name)
4037  (setf active-formatting-elements
4038  (remove formatting-element active-formatting-elements))
4039  (return-from outer nil)))
4040 
4041  ;; Step 1 paragraph 3
4042  (unless (eql formatting-element
4043  (last-open-element))
4044  (perror :adoption-agency-1.3 :name name))
4045 
4046 
4047  ;; Step 2
4048  ;; Start of the adoption agency algorithm proper
4049  (setf afe-index (position formatting-element
4050  open-elements))
4051  (setf furthest-block nil)
4052  (loop for element in (subseq open-elements
4053  afe-index)
4054  do (when (member (node-name-tuple element)
4055  +special-elements+
4056  :test #'equal)
4057  (setf furthest-block element)
4058  (return)))
4059  ;; Step 3
4060  (when (null furthest-block)
4061  (loop for element = (pop-end open-elements)
4062  until (eql formatting-element element)
4063  finally (setf active-formatting-elements
4064  (remove element
4065  active-formatting-elements)))
4066  (return-from outer nil))
4067  (setf common-ancestor (elt open-elements (- afe-index 1)))
4068 
4069  ;; Step 5
4070  ;;if furthestBlock.parent:
4071  ;; furthestBlock.parent.removeChild(furthestBlock)
4072 
4073  ;; Step 5
4074  ;; The bookmark is supposed to help us
4075  ;; identify where to reinsert nodes in step
4076  ;; 12. We have to ensure that we reinsert
4077  ;; nodes after the node before the active
4078  ;; formatting element. Note the bookmark can
4079  ;; move in step 7.4
4080  (setf bookmark (position formatting-element
4081  active-formatting-elements))
4082 
4083  ;; Step 6
4084  (setf node furthest-block)
4085  (setf last-node node)
4086  (setf inner-loop-counter 0)
4087 
4088  (setf index (position node open-elements))
4089  (loop named inner
4090  while (< inner-loop-counter 3)
4091  do
4092  (block continue
4093  (incf inner-loop-counter)
4094  ;; Node is element before node in open elements
4095  (decf index)
4096  (setf node (elt open-elements index))
4097  (when (not (member node active-formatting-elements))
4098  (setf open-elements
4099  (remove node open-elements))
4100  (return-from continue))
4101  ;; Step 6.3
4102  (when (eql node formatting-element)
4103  (return-from inner))
4104  ;; Step 6.4
4105  (when (eql last-node furthest-block)
4106  (setf bookmark (1+ (position node
4107  active-formatting-elements))))
4108  ;; Step 6.5
4109  (setf clone (node-clone* node))
4110  ;; Replace node with clone
4111  (symbol-macrolet
4112  ((af active-formatting-elements)
4113  (oe open-elements))
4114  (setf (elt af (position node af)) clone)
4115  (setf (elt oe (position node oe)) clone))
4116  (setf node clone)
4117 
4118  ;; Step 6.6
4119  ;; Remove lastNode from its parents, if any
4120  (when (node-parent last-node)
4121  (node-remove-child (node-parent last-node)
4122  last-node))
4123  (node-append-child node last-node)
4124 
4125  ;; Step 7.7
4126  (setf last-node node)
4127  ;; End of inner loop
4128  ))
4129 
4130  ;; Step 7
4131  ;; Foster parent lastNode if commonAncestor is a
4132  ;; table, tbody, tfoot, thead, or tr we need to
4133  ;; foster parent the lastNode
4134  (when (node-parent last-node)
4135  (node-remove-child (node-parent last-node)
4136  last-node))
4137 
4138  (if (member (node-name common-ancestor)
4139  '("table" "tbody" "tfoot" "thead" "tr")
4140  :test #'string=)
4141  (multiple-value-bind (parent insert-before)
4142  (get-table-misnested-nodeposition)
4143  (node-insert-before* parent last-node insert-before))
4144  (node-append-child* common-ancestor last-node))
4145 
4146  ;; Step 8
4147  (setf clone (node-clone* formatting-element))
4148 
4149  ;; Step 9
4150  (node-reparent-children furthest-block clone)
4151 
4152  ;; Step 10
4153  (node-append-child* furthest-block clone)
4154 
4155  ;; Step 11
4156  (setf active-formatting-elements
4157  (remove formatting-element
4158  active-formatting-elements))
4159  (insert-elt-at clone bookmark active-formatting-elements)
4160 
4161  ;; Step 12
4162  (setf open-elements
4163  (remove formatting-element
4164  open-elements))
4165  (insert-elt-at clone
4166  (1+ (position furthest-block
4167  open-elements))
4168  open-elements))
4169  nil)
4170 
4171 (def :in-body end-tag-applet-marquee-object (open-elements)
4172  (when (element-in-scope (getf token :name))
4173  (generate-implied-end-tags))
4174  (when (string/= (node-name (last-open-element))
4175  (getf token :name))
4176  (perror :end-tag-too-early :name (getf token :name)))
4177  (when (element-in-scope (getf token :name))
4178  (let ((element (pop-end open-elements)))
4179  (loop until (string= (node-name element) (getf token :name))
4180  do (setf element (pop-end open-elements))))
4181  (clear-active-formatting-elements))
4182  nil)
4183 
4184 (def :in-body end-tag-br (open-elements)
4185  (perror :unexpected-end-tag-treated-as
4186  :original-name "br" :new-name "br element")
4187  (reconstruct-active-formatting-elements)
4188  (insert-element (implied-tag-token "br" :start-tag))
4189  (pop-end open-elements)
4190  nil)
4191 
4192 (def :in-body end-tag-other (open-elements)
4193  (loop for node in (reverse open-elements)
4194  do (cond ((string= (node-name node) (getf token :name))
4195  (generate-implied-end-tags (getf token :name))
4196  (when (string/= (node-name (last-open-element))
4197  (getf token :name))
4198  (perror :unexpected-end-tag :name (getf token :name)))
4199  (loop while (not (eq node
4200  (pop-end open-elements))))
4201  (return))
4202  (t
4203  (when (member (node-name-tuple node) +special-elements+
4204  :test #'equal)
4205  (perror :unexpected-end-tag :name (getf token :name))
4206  (return)))))
4207  nil)
4208 
4209 
4210 ;; TextPhase
4211 
4212 (tagname-dispatch :text process-start-tag
4213  (default start-tag-other))
4214 
4215 (tagname-dispatch :text process-end-tag
4216  ("script" end-tag-script)
4217  (default end-tag-other))
4218 
4219 (def :text process-characters ()
4220  (parser-insert-text (getf token :data))
4221  nil)
4222 
4223 (def :text process-eof (phase original-phase open-elements)
4224  (perror :expected-named-closing-tag-but-got-eof
4225  (node-name (last-open-element)))
4226  (pop-end open-elements)
4227  (setf phase original-phase)
4228  t)
4229 
4230 (def :text start-tag-other ()
4231  (error "Tried to process start tag ~S in RCDATA/RAWTEXT mode" (getf token :name)))
4232 
4233 (def :text end-tag-script (phase original-phase open-elements)
4234  (assert (string= (node-name (pop-end open-elements))
4235  "script"))
4236  (setf phase original-phase)
4237  ;; The rest of this method is all stuff that only happens if
4238  ;; document.write works
4239  nil)
4240 
4241 (def :text end-tag-other (phase original-phase open-elements)
4242  (pop-end open-elements)
4243  (setf phase original-phase)
4244  nil)
4245 
4246 
4247 ;; InTablePhase
4248 ;; http://www.whatwg.org/specs/web-apps/current-work/#in-table
4249 
4250 (tagname-dispatch :in-table process-start-tag
4251  ("html" start-tag-html)
4252  ("caption" start-tag-caption)
4253  ("colgroup" start-tag-colgroup)
4254  ("col" start-tag-col)
4255  (("tbody" "tfoot" "thead") start-tag-row-group)
4256  (("td" "th" "tr") start-tag-imply-tbody)
4257  ("table" start-tag-table)
4258  (("style" "script") start-tag-style-script)
4259  ("input" start-tag-input)
4260  ("form" start-tag-form)
4261  (default start-tag-other))
4262 
4263 (tagname-dispatch :in-table process-end-tag
4264  ("table" end-Tag-Table)
4265  (("body" "caption" "col" "colgroup" "html" "tbody" "td"
4266  "tfoot" "th" "thead" "tr") end-Tag-Ignore)
4267  (default end-tag-other))
4268 
4269 (flet ((clear-stack-to-table-context ()
4270  ;; clear the stack back to a table context
4271  (loop until (member (node-name (last-open-element))
4272  '("table" "html")
4273  :test #'string=)
4274  do
4275  ;;(perror :unexpected-implied-end-tag-in-table
4276  ;; :name (node-name* (last-open-element)))
4277  (pop-end (slot-value *parser* 'open-elements)))
4278  ;; When the current node is <html> it's an innerHTML case
4279  ))
4280 
4281  (def :in-table process-eof (inner-html)
4282  (if (string/= (node-name (last-open-element)) "html")
4283  (perror :eof-in-table)
4284  (assert inner-html))
4285  ;; Stop parsing
4286  nil)
4287 
4288  (def :in-table process-space-characters (phase original-phase)
4289  (setf original-phase phase)
4290  (setf phase :in-table-text)
4291  (process-space-characters token :phase phase)
4292  nil)
4293 
4294  (def :in-table process-characters (phase original-phase)
4295  (setf original-phase phase)
4296  (setf phase :in-table-text)
4297  (process-characters token :phase phase)
4298  nil)
4299 
4300  (def :in-table insert-text (insert-from-table)
4301  ;; If we get here there must be at least one non-whitespace character
4302  ;; Do the table magic!
4303  (setf insert-from-table t)
4304  (process-characters token :phase :in-body)
4305  (setf insert-from-table nil)
4306  nil)
4307 
4308  (def :in-table start-tag-caption (phase active-formatting-elements)
4309  (clear-stack-to-table-context)
4310  (push-end :marker active-formatting-elements)
4311  (insert-element token)
4312  (setf phase :in-caption)
4313  nil)
4314 
4315  (def :in-table start-tag-colgroup (phase)
4316  (clear-stack-to-table-context)
4317  (insert-element token)
4318  (setf phase :in-column-group)
4319  nil)
4320 
4321  (def :in-table start-tag-col ()
4322  (start-tag-colgroup (implied-tag-token "colgroup" :start-tag))
4323  token)
4324 
4325  (def :in-table start-tag-row-group (phase)
4326  (clear-stack-to-table-context)
4327  (insert-element token)
4328  (setf phase :in-table-body)
4329  nil)
4330 
4331  (def :in-table start-tag-imply-tbody ()
4332  (start-tag-row-group (implied-tag-token "tbody" :start-tag))
4333  token)
4334 
4335  (def :in-table start-tag-table (phase inner-html)
4336  (perror :unexpected-start-tag-implies-end-tag
4337  :start-name "table"
4338  :end-name "table")
4339  (process-end-tag (implied-tag-token "table") :phase phase)
4340  (unless inner-html
4341  token))
4342 
4343  (def :in-table start-tag-style-script ()
4344  (process-start-tag token :phase :in-head))
4345 
4346  (def :in-table start-tag-input (open-elements)
4347  (let ((type (assoc "type" (getf token :data) :test #'string=)))
4348  (cond ((and type
4349  (string= (ascii-upper-2-lower (cdr type)) "hidden"))
4350  (perror :unexpected-hidden-input-in-table)
4351  (insert-element token)
4352  ;; XXX associate with form
4353  (pop-end open-elements))
4354  (t
4355  (start-tag-other token))))
4356  nil)
4357 
4358  (def :in-table start-tag-form (form-pointer open-elements)
4359  (perror :unexpected-form-in-table)
4360  (unless form-pointer
4361  (insert-element token)
4362  (setf form-pointer (last-open-element))
4363  (pop-end open-elements))
4364  nil)
4365 
4366  (def :in-table start-tag-other (insert-from-table)
4367  (perror :unexpected-start-tag-implies-table-voodoo :name (getf token :name))
4368  ;; Do the table magic!
4369  (setf insert-from-table t)
4370  (process-start-tag token :phase :in-body)
4371  (setf insert-from-table nil)
4372  nil)
4373 
4374  (def :in-table end-tag-table (inner-html open-elements)
4375  (cond ((element-in-scope "table" "table")
4376  (generate-implied-end-tags)
4377  (unless (equal (node-name (last-open-element)) "table")
4378  (perror :end-tag-too-early-named
4379  :got-name "table"
4380  :expected-name (node-name (last-open-element))))
4381  (loop until (equal (node-name (last-open-element)) "table")
4382  do (pop-end open-elements))
4383  (pop-end open-elements)
4384  (reset-insertion-mode))
4385  (t
4386  ;; innerHTML case
4387  (assert inner-html)
4388  (perror :end-tag-table-in-table-inner-html-case)))
4389  nil)
4390 
4391  (def :in-table end-tag-ignore ()
4392  (perror :unexpected-end-tag :name (getf token :name))
4393  nil)
4394 
4395  (def :in-table end-tag-other (insert-from-table)
4396  (perror :unexpected-end-tag-implies-table-voodoo :name (getf token :name))
4397  ;; Do the table magic!
4398  (setf insert-from-table t)
4399  (process-end-tag token :phase :in-body)
4400  (setf insert-from-table nil)
4401  nil))
4402 
4403 
4404 ;; InTableTextPhase
4405 
4406 (defun flush-characters ()
4407  (with-slots (character-tokens) *parser*
4408  (let ((data (apply #'concatenate 'string
4409  (loop for item in (reverse character-tokens)
4410  collect (getf item :data)))))
4411  (if (not (only-space-characters-p data))
4412  (insert-text (list :type :characters
4413  :data data)
4414  :phase :in-table)
4415  (parser-insert-text data)))
4416  (setf character-tokens nil)))
4417 
4418 (def :in-table-text process-comment (phase original-phase)
4419  (flush-characters)
4420  (setf phase original-phase)
4421  token)
4422 
4423 (def :in-table-text process-eof (phase original-phase)
4424  (flush-characters)
4425  (setf phase original-phase)
4426  t)
4427 
4428 (def :in-table-text process-characters (character-tokens)
4429  (unless (equal (getf token :data) (string #\u0000))
4430  (push token character-tokens))
4431  nil)
4432 
4433 (def :in-table-text process-space-characters (character-tokens)
4434  ;; pretty sure we should never reach here
4435  (push token character-tokens)
4436  nil)
4437 
4438 (def :in-table-text process-start-tag (phase original-phase)
4439  (flush-characters)
4440  (setf phase original-phase)
4441  token)
4442 
4443 (def :in-table-text process-end-tag (phase original-phase)
4444  (flush-characters)
4445  (setf phase original-phase)
4446  token)
4447 
4448 
4449 ;; InCaptionPhase
4450 ;; http://www.whatwg.org/specs/web-apps/current-work/#in-caption
4451 
4452 (tagname-dispatch :in-caption process-start-tag
4453  ("html" start-tag-html)
4454  (("caption" "col" "colgroup" "tbody" "td" "tfoot" "th"
4455  "thead" "tr") start-tag-table-element)
4456  (default start-tag-other))
4457 
4458 (tagname-dispatch :in-caption process-end-tag
4459  ("caption" end-tag-caption)
4460  ("table" end-tag-table)
4461  (("body" "col" "colgroup" "html" "tbody" "td" "tfoot" "th"
4462  "thead" "tr") end-tag-ignore)
4463  (default end-tag-other))
4464 
4465 (flet ((ignore-end-tag-caption ()
4466  (not (element-in-scope "caption" "table"))))
4467 
4468  (def :in-caption process-eof ()
4469  (process-eof token :phase :in-body))
4470 
4471  (def :in-caption process-characters ()
4472  (process-characters token :phase :in-body))
4473 
4474  (def :in-caption start-tag-table-element (phase)
4475  (perror :start-tag-table-element-in-caption)
4476  ;; XXX Have to duplicate logic here to find out if the tag is ignored
4477  (prog1 (unless (ignore-end-tag-caption)
4478  token)
4479  (process-end-tag (implied-tag-token "caption") :phase phase)))
4480 
4481  (def :in-caption start-tag-other ()
4482  (process-start-tag token :phase :in-body))
4483 
4484  (def :in-caption end-tag-caption (phase inner-html open-elements)
4485  (cond ((not (ignore-end-tag-caption))
4486  ;; AT this code is quite similar to endTagTable in "InTable"
4487  (generate-implied-end-tags)
4488  (unless (equal (node-name (last-open-element)) "caption")
4489  (perror :expected-one-end-tag-but-got-another
4490  :got-name "caption"
4491  :expected-name (node-name (last-open-element))))
4492  (loop until (equal (node-name (last-open-element)) "caption")
4493  do (pop-end open-elements))
4494  (clear-active-formatting-elements)
4495  (setf phase :in-table))
4496  (t
4497  ;; innerHTML case
4498  (assert inner-html)
4499  (perror :end-tag-caption-in-caption-inner-html-mode)))
4500  nil)
4501 
4502  (def :in-caption end-tag-table (phase)
4503  (perror :end-tag-table-in-caption)
4504  (prog1 (unless (ignore-end-tag-caption)
4505  token)
4506  (process-end-tag (implied-tag-token "caption") :phase phase)))
4507 
4508  (def :in-caption end-tag-ignore ()
4509  (perror :unexpected-end-tag :name (getf token :name))
4510  nil)
4511 
4512  (def :in-caption end-tag-other ()
4513  (process-end-tag token :phase :in-body)))
4514 
4515 
4516 ;; InColumnGroupPhase
4517 ;; http://www.whatwg.org/specs/web-apps/current-work/#in-column
4518 
4519 (tagname-dispatch :in-column-group process-start-tag
4520  ("html" start-tag-html)
4521  ("col" start-tag-col)
4522  (default start-tag-other))
4523 
4524 (tagname-dispatch :in-column-group process-end-tag
4525  ("colgroup" end-tag-colgroup)
4526  ("col" end-tag-col)
4527  (default end-tag-other))
4528 
4529 
4530 (flet ((ignore-end-tag-colgroup ()
4531  (string= (node-name (last-open-element)) "html")))
4532 
4533  (def :in-column-group process-eof (inner-html)
4534  (cond ((string= (node-name (last-open-element)) "html")
4535  (assert inner-html)
4536  nil)
4537  (t
4538  (let ((ignore-end-tag (ignore-end-tag-colgroup)))
4539  (end-tag-colgroup (implied-tag-token "colgroup"))
4540  (not ignore-end-tag)))))
4541 
4542  (def :in-column-group process-characters ()
4543  (prog1 (unless (ignore-end-tag-colgroup)
4544  token)
4545  (end-tag-colgroup (implied-tag-token "colgroup"))))
4546 
4547  (def :in-column-group start-tag-col (open-elements)
4548  (insert-element token)
4549  (pop-end open-elements)
4550  nil)
4551 
4552  (def :in-column-group start-tag-other ()
4553  (prog1 (unless (ignore-end-tag-colgroup)
4554  token)
4555  (end-tag-colgroup (implied-tag-token "colgroup"))))
4556 
4557  (def :in-column-group end-tag-colgroup (phase open-elements)
4558  (cond ((ignore-end-tag-colgroup)
4559  ;; innerHTML case
4560  (perror :end-tag-colgroup-in-column-group-inner-html-mode))
4561  (t
4562  (pop-end open-elements)
4563  (setf phase :in-table)))
4564  nil)
4565 
4566  (def :in-column-group end-tag-col ()
4567  (perror :no-end-tag :name "col")
4568  nil)
4569 
4570  (def :in-column-group end-tag-other ()
4571  (prog1 (unless (ignore-end-tag-colgroup)
4572  token)
4573  (end-tag-colgroup (implied-tag-token "colgroup")))))
4574 
4575 
4576 ;; InTableBodyPhase
4577 ;; http://www.whatwg.org/specs/web-apps/current-work/#in-table0
4578 
4579 (tagname-dispatch :in-table-body process-start-tag
4580  ("html" start-tag-html)
4581  ("tr" start-tag-tr)
4582  (("td" "th") start-tag-table-cell)
4583  (("caption" "col" "colgroup" "tbody" "tfoot" "thead") start-tag-table-other)
4584  (default start-tag-other))
4585 
4586 (tagname-dispatch :in-table-body process-end-tag
4587  (("tbody" "tfoot" "thead") end-Tag-Table-Row-Group)
4588  ("table" end-Tag-Table)
4589  (("body" "caption" "col" "colgroup" "html" "td" "th" "tr") end-Tag-Ignore)
4590  (default end-tag-other))
4591 
4592 (flet ((clear-stack-to-table-body-context ()
4593  (loop until (member (node-name (last-open-element))
4594  '("tbody" "tfoot" "thead" "html")
4595  :test #'string=)
4596  do
4597  ;;(perror :unexpected-implied-end-tag-in-table
4598  ;; :name (node-name (last-open-element)))
4599  (pop-end (slot-value *parser* 'open-elements)))
4600  (when (string= (node-name (last-open-element)) "html")
4601  (assert (slot-value *parser* 'inner-html)))))
4602 
4603  (def :in-table-body process-eof ()
4604  (process-eof token :phase :in-table))
4605 
4606  (def :in-table-body process-space-characters ()
4607  (process-space-characters token :phase :in-table))
4608 
4609  (def :in-table-body process-characters ()
4610  (process-characters token :phase :in-table))
4611 
4612  (def :in-table-body start-tag-tr (phase)
4613  (clear-stack-to-table-body-context)
4614  (insert-element token)
4615  (setf phase :in-row)
4616  nil)
4617 
4618  (def :in-table-body start-tag-table-cell ()
4619  (perror :unexpected-cell-in-table-body :name (getf token :name))
4620  (start-tag-tr (implied-tag-token "tr" :start-tag))
4621  token)
4622 
4623  (def :in-table-body start-tag-table-other (inner-html)
4624  ;; XXX AT Any ideas on how to share this with endTagTable?
4625  (cond ((or (element-in-scope "tbody" "table")
4626  (element-in-scope "thead" "table")
4627  (element-in-scope "tfoot" "table"))
4628  (clear-stack-to-table-body-context)
4629  (end-tag-table-row-group
4630  (implied-tag-token (node-name (last-open-element))))
4631  token)
4632  (t
4633  ;; innerHTML case
4634  (assert inner-html)
4635  (perror :start-tag-table-other-in-table-body-inner-html-mode)
4636  nil)))
4637 
4638  (def :in-table-body start-tag-other ()
4639  (process-start-tag token :phase :in-table))
4640 
4641  (def :in-table-body end-tag-table-row-group (phase open-elements)
4642  (cond ((element-in-scope (getf token :name) "table")
4643  (clear-stack-to-table-body-context)
4644  (pop-end open-elements)
4645  (setf phase :in-table))
4646  (t
4647  (perror :unexpected-end-tag-in-table-body :name (getf token :name))))
4648  nil)
4649 
4650  (def :in-table-body end-tag-table (inner-html)
4651  (cond ((or (element-in-scope "tbody" "table")
4652  (element-in-scope "thead" "table")
4653  (element-in-scope "tfoot" "table"))
4654  (clear-stack-to-table-body-context)
4655  (end-tag-table-row-group
4656  (implied-tag-token (node-name (last-open-element))))
4657  token)
4658  (t
4659  ;; innerHTML case
4660  (assert inner-html)
4661  (perror :end-tag-table-other-in-table-body-inner-html-mode)
4662  nil)))
4663 
4664  (def :in-table-body end-tag-ignore ()
4665  (perror :unexpected-end-tag-in-table-body :name (getf token :name))
4666  nil)
4667 
4668  (def :in-table-body end-tag-other ()
4669  (process-end-tag token :phase :in-table)))
4670 
4671 ;; InRowPhase
4672 ;; http://www.whatwg.org/specs/web-apps/current-work/#in-row
4673 
4674 (tagname-dispatch :in-row process-start-tag
4675  ("html" start-tag-html)
4676  (("td" "th") start-tag-table-cell)
4677  (("caption" "col" "colgroup" "tbody" "tfoot" "thead" "tr")
4678  start-tag-table-other)
4679  (default start-tag-other))
4680 
4681 (tagname-dispatch :in-row process-end-tag
4682  ("tr" end-tag-tr)
4683  ("table" end-tag-table)
4684  (("tbody" "tfoot" "thead") end-tag-table-row-group)
4685  (("body" "caption" "col" "colgroup" "html" "td" "th") end-tag-ignore)
4686  (default end-tag-other))
4687 
4688 
4689 ;; helper methods (XXX unify this with other table helper methods)
4690 (flet ((clear-stack-to-table-row-context ()
4691  (loop until (member (node-name (last-open-element))
4692  '("tr" "html")
4693  :test #'string=)
4694  do
4695  (perror :unexpected-implied-end-tag-in-table-row
4696  :name (node-name (last-open-element)))
4697  (pop-end (slot-value *parser* 'open-elements))))
4698 
4699  (ignore-end-tag-tr ()
4700  (not (element-in-scope "tr" "table"))))
4701 
4702  ;; the rest
4703  (def :in-row process-eof ()
4704  (process-eof token :phase :in-table)
4705  nil)
4706 
4707  (def :in-row process-space-characters ()
4708  (process-space-characters token :phase :in-table))
4709 
4710  (def :in-row process-characters ()
4711  (process-characters token :phase :in-table))
4712 
4713  (def :in-row start-tag-table-cell (phase active-formatting-elements)
4714  (clear-stack-to-table-row-context)
4715  (insert-element token)
4716  (setf phase :in-cell)
4717  (push-end :marker active-formatting-elements)
4718  nil)
4719 
4720  (def :in-row start-tag-table-other ()
4721  (let ((ignore-end-tag (ignore-end-tag-tr)))
4722  (end-tag-tr (implied-tag-token "tr"))
4723  ;; XXX how are we sure it's always ignored in the innerHTML case?
4724  (unless ignore-end-tag
4725  token)))
4726 
4727  (def :in-row start-tag-other ()
4728  (process-start-tag token :phase :in-table))
4729 
4730  (def :in-row end-tag-tr (phase inner-html open-elements)
4731  (cond ((not (ignore-end-tag-tr))
4732  (clear-stack-to-table-row-context)
4733  (pop-end open-elements)
4734  (setf phase :in-table-body))
4735  (t
4736  ;; innerHTML case
4737  (assert inner-html)
4738  (perror :end-tag-tr-inner-html-mode)))
4739  nil)
4740 
4741  (def :in-row end-tag-table ()
4742  (let ((ignore-end-tag (ignore-end-tag-tr)))
4743  (end-tag-tr (implied-tag-token "tr"))
4744  ;; Reprocess the current tag if the tr end tag was not ignored
4745  ;; XXX how are we sure it's always ignored in the innerHTML case?
4746  (unless ignore-end-tag
4747  token)))
4748 
4749  (def :in-row end-tag-table-row-group ()
4750  (cond ((element-in-scope (getf token :name) "table")
4751  (end-tag-tr (implied-tag-token "tr"))
4752  token)
4753  (t
4754  (perror :end-tag-table-row-group-something-wrong)
4755  nil)))
4756 
4757  (def :in-row end-tag-ignore ()
4758  (perror :unexpected-end-tag-in-table-row (getf token :name))
4759  nil)
4760 
4761  (def :in-row end-tag-other ()
4762  (process-end-tag token :phase :in-table)))
4763 
4764 
4765 ;; InCellPhase
4766 ;; http://www.whatwg.org/specs/web-apps/current-work/#in-cell
4767 
4768 (tagname-dispatch :in-cell process-start-tag
4769  ("html" start-tag-html)
4770  (("caption" "col" "colgroup" "tbody" "td" "tfoot" "th" "thead" "tr")
4771  start-tag-table-other)
4772  (default start-tag-other))
4773 
4774 (tagname-dispatch :in-cell process-end-tag
4775  (("td" "th") end-tag-table-cell)
4776  (("body" "caption" "col" "colgroup" "html") end-tag-ignore)
4777  (("table" "tbody" "tfoot" "thead" "tr") end-tag-imply)
4778  (default end-tag-other))
4779 
4780 (flet ((close-cell ()
4781  (if (element-in-scope "td" "table")
4782  (end-tag-table-cell (implied-tag-token "td"))
4783  (if (element-in-scope "th" "table")
4784  (end-tag-table-cell (implied-tag-token "th"))))))
4785 
4786  (def :in-cell process-eof ()
4787  (process-eof token :phase :in-body)
4788  nil)
4789 
4790  (def :in-cell process-characters ()
4791  (process-characters token :phase :in-body))
4792 
4793  (def :in-cell start-tag-table-other (inner-html)
4794  (cond ((or (element-in-scope "td" "table")
4795  (element-in-scope "th" "table"))
4796  (close-cell)
4797  token)
4798  (t
4799  ;; innerHTML case
4800  (assert inner-html)
4801  (perror :start-tag-table-other-in-inner-html-mode)
4802  nil)))
4803 
4804  (def :in-cell start-tag-other ()
4805  (process-start-tag token :phase :in-body))
4806 
4807  (def :in-cell end-tag-table-cell (phase open-elements)
4808  (cond ((element-in-scope (getf token :name) "table")
4809  (generate-implied-end-tags (getf token :name))
4810  (cond ((not (equal (node-name (last-open-element))
4811  (getf token :name)))
4812  (perror :unexpected-cell-end-tag :name (getf token :name))
4813  (loop until (equal (node-name (pop-end open-elements))
4814  (getf token :name))))
4815  (t
4816  (pop-end open-elements)))
4817  (clear-active-formatting-elements)
4818  (setf phase :in-row))
4819  (t
4820  (perror :unexpected-end-tag :name (getf token :name))))
4821  nil)
4822 
4823  (def :in-cell end-tag-ignore ()
4824  (perror :unexpected-end-tag :name (getf token :name))
4825  nil)
4826 
4827  (def :in-cell end-tag-imply ()
4828  (cond ((element-in-scope (getf token :name) "table")
4829  (close-cell)
4830  token)
4831  (t
4832  ;; sometimes innerHTML case
4833  (perror :end-tag-imply-sometimes-inner-html-case)
4834  nil)))
4835 
4836  (def :in-cell end-tag-other ()
4837  (process-end-tag token :phase :in-body)))
4838 
4839 
4840 ;; InSelectPhase
4841 
4842 (tagname-dispatch :in-select process-start-tag
4843  ("html" start-tag-html)
4844  ("option" start-tag-option)
4845  ("optgroup" start-tag-optgroup)
4846  ("select" start-tag-select)
4847  (("input" "keygen" "textarea") start-tag-input)
4848  ("script" start-tag-script)
4849  (default start-tag-other))
4850 
4851 (tagname-dispatch :in-select process-end-tag
4852  ("option" end-tag-option)
4853  ("optgroup" end-tag-optgroup)
4854  ("select" end-tag-select)
4855  (default end-tag-other))
4856 
4857 ;; http://www.whatwg.org/specs/web-apps/current-work/#in-select
4858 (def :in-select process-eof (inner-html)
4859  (if (not (equal (node-name (last-open-element)) "html"))
4860  (perror :eof-in-select)
4861  (assert inner-html))
4862  nil)
4863 
4864 (def :in-select process-characters ()
4865  (unless (equal (getf token :data) (string #\u0000))
4866  (parser-insert-text (getf token :data)))
4867  nil)
4868 
4869 (def :in-select start-tag-option (open-elements)
4870  ;; We need to imply </option> if <option> is the current node.
4871  (when (equal (node-name (last-open-element)) "option")
4872  (pop-end open-elements))
4873  (insert-element token)
4874  nil)
4875 
4876 (def :in-select start-tag-optgroup (open-elements)
4877  (when (equal (node-name (last-open-element)) "option")
4878  (pop-end open-elements))
4879  (when (equal (node-name (last-open-element)) "optgroup")
4880  (pop-end open-elements))
4881  (insert-element token)
4882  nil)
4883 
4884 (def :in-select start-tag-select ()
4885  (perror :unexpected-select-in-select)
4886  (end-tag-select (implied-tag-token "select"))
4887  nil)
4888 
4889 (def :in-select start-tag-input (inner-html)
4890  (perror :unexpected-input-in-select)
4891  (cond ((element-in-scope "select" "select")
4892  (end-tag-select (implied-tag-token "select"))
4893  token)
4894  (t
4895  (assert inner-html)
4896  nil)))
4897 
4898 (def :in-select start-tag-script ()
4899  (process-start-tag token :phase :in-head))
4900 
4901 (def :in-select start-tag-other ()
4902  (perror :unexpected-start-tag-in-select :name (getf token :name))
4903  nil)
4904 
4905 (def :in-select end-tag-option (open-elements)
4906  (if (equal (node-name (last-open-element)) "option")
4907  (pop-end open-elements)
4908  (perror :unexpected-end-tag-in-select :name (getf token :name)))
4909  nil)
4910 
4911 (def :in-select end-tag-optgroup (open-elements)
4912  ;; </optgroup> implicitly closes <option>
4913  (when (and (equal (node-name (last-open-element)) "option")
4914  (equal (node-name (elt open-elements
4915  (- (length open-elements) 2)))
4916  "optgroup"))
4917  (pop-end open-elements))
4918  ;; It also closes </optgroup>
4919  (if (equal (node-name (last-open-element)) "optgroup")
4920  (pop-end open-elements)
4921  ;; But nothing else
4922  (perror :unexpected-end-tag-in-select :name (getf token :name)))
4923  nil)
4924 
4925 (def :in-select end-tag-select (inner-html open-elements)
4926  (cond ((element-in-scope "select" "select")
4927  (loop until (equal (node-name (pop-end open-elements))
4928  "select"))
4929  (reset-insertion-mode))
4930  (t
4931  ;; innerHTML case
4932  (assert inner-html)
4933  (perror :end-tag-select-in-inner-html-mode)))
4934  nil)
4935 
4936 (def :in-select end-tag-other ()
4937  (perror :unexpected-end-tag-in-select :name (getf token :name))
4938  nil)
4939 
4940 
4941 ;; InSelectInTablePhase
4942 
4943 (tagname-dispatch :in-select-in-table process-start-tag
4944  (("caption" "table" "tbody" "tfoot" "thead" "tr" "td" "th") start-tag-table)
4945  (default start-tag-other))
4946 
4947 (tagname-dispatch :in-select-in-table process-end-tag
4948  (("caption" "table" "tbody" "tfoot" "thead" "tr" "td" "th") end-tag-table)
4949  (default end-tag-other))
4950 
4951 (def :in-select-in-table process-eof ()
4952  (process-eof token :phase :in-select)
4953  nil)
4954 
4955 (def :in-select-in-table process-characters ()
4956  (process-characters token :phase :in-select))
4957 
4958 (def :in-select-in-table start-tag-table ()
4959  (perror :unexpected-table-element-start-tag-in-select-in-table :name (getf token :name))
4960  (end-tag-other (implied-tag-token "select"))
4961  token)
4962 
4963 (def :in-select-in-table start-tag-other ()
4964  (process-start-tag token :phase :in-select))
4965 
4966 (def :in-select-in-table end-tag-table ()
4967  (perror :unexpected-table-element-end-tag-in-select-in-table :name (getf token :name))
4968  (cond ((element-in-scope (getf token :name) "table")
4969  (end-tag-other (implied-tag-token "select"))
4970  token)
4971  (t
4972  nil)))
4973 
4974 (def :in-select-in-table end-tag-other ()
4975  (process-end-tag token :phase :in-select))
4976 
4977 
4978 ;; InForeignContentPhase
4979 
4980 (defparameter +breakout-elements+
4981  '("b" "big" "blockquote" "body" "br"
4982  "center" "code" "dd" "div" "dl" "dt"
4983  "em" "embed" "h1" "h2" "h3"
4984  "h4" "h5" "h6" "head" "hr" "i" "img"
4985  "li" "listing" "menu" "meta" "nobr"
4986  "ol" "p" "pre" "ruby" "s" "small"
4987  "span" "strong" "strike" "sub" "sup"
4988  "table" "tt" "u" "ul" "var"))
4989 
4990 
4991 (defun adjust-svg-tag-names (token)
4992  (let ((replacement (cdr
4993  (assoc (getf token :name)
4994  '(("altglyph" . "altGlyph")
4995  ("altglyphdef" . "altGlyphDef")
4996  ("altglyphitem" . "altGlyphItem")
4997  ("animatecolor" . "animateColor")
4998  ("animatemotion" . "animateMotion")
4999  ("animatetransform" . "animateTransform")
5000  ("clippath" . "clipPath")
5001  ("feblend" . "feBlend")
5002  ("fecolormatrix" . "feColorMatrix")
5003  ("fecomponenttransfer" . "feComponentTransfer")
5004  ("fecomposite" . "feComposite")
5005  ("feconvolvematrix" . "feConvolveMatrix")
5006  ("fediffuselighting" . "feDiffuseLighting")
5007  ("fedisplacementmap" . "feDisplacementMap")
5008  ("fedistantlight" . "feDistantLight")
5009  ("feflood" . "feFlood")
5010  ("fefunca" . "feFuncA")
5011  ("fefuncb" . "feFuncB")
5012  ("fefuncg" . "feFuncG")
5013  ("fefuncr" . "feFuncR")
5014  ("fegaussianblur" . "feGaussianBlur")
5015  ("feimage" . "feImage")
5016  ("femerge" . "feMerge")
5017  ("femergenode" . "feMergeNode")
5018  ("femorphology" . "feMorphology")
5019  ("feoffset" . "feOffset")
5020  ("fepointlight" . "fePointLight")
5021  ("fespecularlighting" . "feSpecularLighting")
5022  ("fespotlight" . "feSpotLight")
5023  ("fetile" . "feTile")
5024  ("feturbulence" . "feTurbulence")
5025  ("foreignobject" . "foreignObject")
5026  ("glyphref" . "glyphRef")
5027  ("lineargradient" . "linearGradient")
5028  ("radialgradient" . "radialGradient")
5029  ("textpath" . "textPath"))
5030  :test #'string=))))
5031  (when replacement
5032  (setf (getf token :name) replacement))))
5033 
5034 
5035 (defparameter +only-space-characters-regexp+
5036  (cl-ppcre:create-scanner `(:sequence :start-anchor
5037  (:greedy-repetition
5038  0 nil
5039  (:alternation ,@(coerce +space-characters+ 'list)))
5040  :end-anchor)
5041  :multi-line-mode t))
5042 
5043 (defun only-space-characters-p (string)
5044  (cl-ppcre:scan +only-space-characters-regexp+ string))
5045 
5046 (def :in-foreign-content process-characters (frameset-ok)
5047  (cond ((equal (getf token :data) (string #\u0000))
5048  (setf (getf token :data) (string #\uFFFD)))
5049  ((and frameset-ok
5050  (not (only-space-characters-p (getf token :data))))
5051  (setf frameset-ok nil)))
5052  (process-characters token :phase nil)
5053  nil)
5054 
5055 (def :in-foreign-content process-start-tag (html-namespace open-elements)
5056  (block nil
5057  (let ((current-node (last-open-element)))
5058  (cond ((or (member (getf token :name) +breakout-elements+ :test #'string=)
5059  (and (string= (getf token :name) "font")
5060  (intersection (mapcar #'car (getf token :data))
5061  '("color" "face" "size")
5062  :test #'string=)))
5063  (parser-parse-error :unexpected-html-element-in-foreign-content
5064  (getf token :name))
5065  (loop until (or (is-html-integration-point (last-open-element))
5066  (is-math-ml-text-integration-point (last-open-element))
5067  (equal (node-namespace (last-open-element))
5068  html-namespace))
5069  do (pop-end open-elements))
5070  (return token))
5071  (t
5072  (cond ((equal (node-namespace current-node) (find-namespace "mathml"))
5073  (adjust-math-ml-attributes token))
5074  ((equal (node-namespace current-node) (find-namespace "svg"))
5075  (adjust-svg-tag-names token)
5076  (adjust-svg-attributes token)))
5077  (adjust-foreign-attributes token)
5078  (setf (getf token :namespace) (node-namespace current-node))
5079  (insert-element token)
5080  (when (getf token :self-closing)
5081  (pop-end open-elements)
5082  (setf (getf token :self-closing-acknowledged) t)))))
5083  nil))
5084 
5085 (def :in-foreign-content process-end-tag (phase original-phase html-namespace open-elements)
5086  (let ((new-token)
5087  (node-index (1- (length open-elements)))
5088  (node (last-open-element)))
5089  (unless (string= (node-name node) (getf token :name))
5090  (parser-parse-error :unexpected-end-tag (getf token :name)))
5091 
5092  (loop
5093  (when (string= (ascii-upper-2-lower (node-name node)) (getf token :name))
5094  ;; XXX this isn't in the spec but it seems necessary
5095  (when (eql phase :in-table-text)
5096  (flush-characters)
5097  (setf phase original-phase))
5098  (loop until (eql (pop-end open-elements) node)
5099  do (assert open-elements))
5100  (setf new-token nil)
5101  (return))
5102  (decf node-index)
5103 
5104  (setf node (elt open-elements node-index))
5105  (when (equal (node-namespace node)
5106  html-namespace)
5107  (setf new-token (process-end-tag token :phase phase))
5108  (return)))
5109  new-token))
5110 
5111 ;; AfterBodyPhase
5112 
5113 (tagname-dispatch :after-body process-start-tag
5114  ("html" start-tag-html)
5115  (default start-tag-other))
5116 
5117 (tagname-dispatch :after-body process-end-tag
5118  ("html" end-tag-html)
5119  (default end-tag-other))
5120 
5121 (def :after-body process-eof ()
5122  ;; Stop parsing
5123  nil)
5124 
5125 (def :after-body process-comment (open-elements)
5126  ;; This is needed because data is to be appended to the <html> element
5127  ;; here and not to whatever is currently open.
5128  (insert-comment token (first open-elements))
5129  nil)
5130 
5131 (def :after-body process-characters (phase)
5132  (parser-parse-error :unexpected-char-after-body)
5133  (setf phase :in-body)
5134  token)
5135 
5136 (def :after-body start-tag-html ()
5137  (process-start-tag token :phase :in-body))
5138 
5139 (def :after-body start-tag-other (phase)
5140  (parser-parse-error :unexpected-start-tag-after-body
5141  `(:name ,(getf token :name)))
5142  (setf phase :in-body)
5143  token)
5144 
5145 (def :after-body end-tag-html (inner-html phase)
5146  (if inner-html
5147  (parser-parse-error :unexpected-end-tag-after-body-innerhtml)
5148  (setf phase :after-after-body))
5149  nil)
5150 
5151 (def :after-body end-tag-other (phase)
5152  (parser-parse-error :unexpected-end-tag-after-body
5153  `(:name ,(getf token :name)))
5154  (setf phase :in-body)
5155  token)
5156 
5157 ;; InFramesetPhase
5158 
5159 (tagname-dispatch :in-frameset process-start-tag
5160  ("html" start-tag-html)
5161  ("frameset" start-tag-frameset)
5162  ("frame" start-tag-frame)
5163  ("noframes"start-tag-noframes)
5164  (default start-tag-other))
5165 
5166 (tagname-dispatch :in-frameset process-end-tag
5167  ("frameset" end-tag-frameset)
5168  (default end-tag-other))
5169 
5170 
5171 (def :in-frameset process-eof (inner-html)
5172  (if (string/= (node-name (last-open-element)) "html")
5173  (parser-parse-error :eof-in-frameset)
5174  (assert inner-html))
5175  nil)
5176 
5177 (def :in-frameset process-characters ()
5178  (parser-parse-error :unexpected-char-in-frameset)
5179  nil)
5180 
5181 (def :in-frameset start-tag-frameset ()
5182  (insert-element token)
5183  nil)
5184 
5185 (def :in-frameset start-tag-frame (open-elements)
5186  (insert-element token)
5187  (pop-end open-elements)
5188  nil)
5189 
5190 (def :in-frameset start-tag-noframes ()
5191  (process-start-tag token :phase :in-body))
5192 
5193 (def :in-frameset start-tag-other ()
5194  (parser-parse-error :unexpected-start-tag-in-frameset
5195  `(:name ,(getf token :name)))
5196  nil)
5197 
5198 (def :in-frameset end-tag-frameset (phase inner-html open-elements)
5199  (if (string= (node-name (last-open-element)) "html")
5200  ;; innerHTML case
5201  (parser-parse-error :unexpected-frameset-in-frameset-innerhtml)
5202  (pop-end open-elements))
5203 
5204  (when (and (not inner-html)
5205  (string/= (node-name (last-open-element)) "frameset"))
5206  ;; If we're not in innerHTML mode and the the current node is not a
5207  ;; "frameset" element (anymore) then switch.
5208  (setf phase :after-frameset))
5209  nil)
5210 
5211 (def :in-frameset end-tag-other ()
5212  (parser-parse-error :unexpected-end-tag-in-frameset
5213  `(:name ,(getf token :name)))
5214  nil)
5215 
5216 
5217 ;; AfterFramesetPhase
5218 
5219 (tagname-dispatch :after-frameset process-start-tag
5220  ("html" start-tag-html)
5221  ("noframes" start-tag-noframes)
5222  (default start-tag-other))
5223 
5224 (tagname-dispatch :after-frameset process-end-tag
5225  ("html" end-tag-html)
5226  (default end-tag-other))
5227 
5228 (def :after-frameset process-eof ()
5229  ;; Stop parsing
5230  nil)
5231 
5232 (def :after-frameset process-characters ()
5233  (parser-parse-error :unexpected-char-after-frameset)
5234  nil)
5235 
5236 (def :after-frameset start-tag-noframes ()
5237  (process-start-tag token :phase :in-head))
5238 
5239 (def :after-frameset start-tag-other ()
5240  (parser-parse-error :unexpected-start-tag-after-frameset
5241  `(:name ,(getf token :name)))
5242  nil)
5243 
5244 (def :after-frameset end-tag-html (phase)
5245  (setf phase :after-after-frameset)
5246  nil)
5247 
5248 (def :after-frameset end-tag-other ()
5249  (parser-parse-error :unexpected-end-tag-after-frameset
5250  `(:name ,(getf token :name)))
5251  nil)
5252 
5253 ;; AfterAfterBodyPhase
5254 
5255 (tagname-dispatch :after-after-body process-start-tag
5256  ("html" start-tag-html)
5257  (default start-tag-other))
5258 
5259 (def :after-after-body process-eof ()
5260  nil)
5261 
5262 (def :after-after-body process-comment ()
5263  (insert-comment token (document*))
5264  nil)
5265 
5266 (def :after-after-body process-space-characters ()
5267  (process-space-characters token :phase :in-body))
5268 
5269 (def :after-after-body process-characters (phase)
5270  (parser-parse-error :expected-eof-but-got-char)
5271  (setf phase :in-body)
5272  token)
5273 
5274 (def :after-after-body start-tag-html ()
5275  (process-start-tag token :phase :in-body))
5276 
5277 (def :after-after-body start-tag-other (phase)
5278  (parser-parse-error :expected-eof-but-got-start-tag
5279  `(:name (getf token :name)))
5280  (setf phase :in-body)
5281  token)
5282 
5283 (def :after-after-body process-end-tag (phase)
5284  (parser-parse-error :expected-eof-but-got-end-tag
5285  `(:name (getf token :name)))
5286  (setf phase :in-body)
5287  token)
5288 
5289 ;; AfterAfterFramesetPhase
5290 
5291 (tagname-dispatch :after-after-frameset process-start-tag
5292  ("html" start-tag-html)
5293  ("noframes" start-tag-noframes)
5294  (default start-tag-other))
5295 
5296 (def :after-after-frameset process-eof ()
5297  nil)
5298 
5299 (def :after-after-frameset process-comment ()
5300  (insert-comment token (document*))
5301  nil)
5302 
5303 (def :after-after-frameset process-space-characters ()
5304  (process-space-characters token :phase :in-body))
5305 
5306 (def :after-after-frameset process-characters ()
5307  (parser-parse-error :expected-eof-but-got-char)
5308  nil)
5309 
5310 (def :after-after-frameset start-tag-html ()
5311  (process-start-tag token :phase :in-body))
5312 
5313 (def :after-after-frameset start-tag-noframes ()
5314  (process-start-tag token :phase :in-head))
5315 
5316 (def :after-after-frameset start-tag-other ()
5317  (parser-parse-error :expected-eof-but-got-start-tag
5318  `(:name (getf token :name)))
5319  nil)
5320 
5321 (def :after-after-frameset process-end-tag ()
5322  (parser-parse-error :expected-eof-but-got-end-tag
5323  `(:name (getf token :name)))
5324  nil)
5325 
5326 ;;; toxml
5327 (defun xml-escape-name (name)
5328  "Escapes a node name (element, attribute, doctype) by replacing any
5329 character not valid in XML name by Uxxxxxx, where x is the code point
5330 as six hex digits. This encoding is reversable, since the HTML parser
5331 down cases all characters in names.
5332 
5333 See: https://www.w3.org/TR/html5/syntax.html#coercing-an-html-dom-into-an-infoset"
5334  (if (and (xml-name-start-char-p (char name 0))
5335  (every #'xml-name-char-p name))
5336  name
5337  (with-output-to-string (out)
5338  (loop for first = t then nil
5339  for c across name do
5340  (if (if first
5341  (xml-name-start-char-p c)
5342  (xml-name-char-p c))
5343  (princ c out)
5344  (format out "U~:@(~6,'0X~)" (char-code c)))))))
5345 
5346 
5347 (defun xml-unescape-name (name)
5348  "Reverert escaping done by xml-unescape-name."
5349  (cl-ppcre:regex-replace-all
5350  "U[0-9A-F]{6}"
5351  name
5352  (lambda (u)
5353  (string (code-char (parse-integer u :start 1 :radix 16))))
5354  :simple-calls t))
5355 
5356 
5357 (defun xml-name-start-char-p (c)
5358  (or (char<= #\a c #\z)
5359  (char= #\_ c)
5360  (char<= #\A c #\Z)
5361  (char<= (code-char #xC0) c (code-char #xD6))
5362  (char<= (code-char #xD8) c (code-char #xF6))
5363  (char<= (code-char #xF8) c (code-char #x2FF))
5364  (char<= (code-char #x370) c (code-char #x37D))
5365  (char<= (code-char #x37F) c (code-char #x1FFF))
5366  (char<= (code-char #x200C) c (code-char #x200D))
5367  (char<= (code-char #x2070) c (code-char #x218F))
5368  (char<= (code-char #x2C00) c (code-char #x2FEF))
5369  (char<= (code-char #x3001) c (code-char #xD7FF))
5370  (char<= (code-char #xF900) c (code-char #xFDCF))
5371  (char<= (code-char #xFDF0) c (code-char #xFFFD))
5372  (char<= (code-char #x10000) c (code-char #xEFFFF))))
5373 
5374 
5375 (defun xml-name-char-p (c)
5376  (or (xml-name-start-char-p c)
5377  (char= #\- c)
5378  (char= #\. c)
5379  (char<= #\0 c #\9)
5380  (char= (code-char #xB7) c)
5381  (char<= (code-char #x0300) c (code-char #x036F))
5382  (char<= (code-char #x203F) c (code-char #x2040))))
5383 
5384 ;;; XML DOM
5385 
5386 (defmethod transform-html5-dom ((to-type (eql :xmls)) node
5387  &key namespace comments)
5388  "Convert a node into an XMLS-compatible tree of conses, starting
5389 at. If the node is a document-fragement a list of XMLS trees is returned."
5390  (labels ((node-to-xmls (node parent-ns xlink-defined)
5391  (ecase (node-type node)
5392  (:document
5393  (let (root)
5394  (element-map-children (lambda (n)
5395  (when (string= (node-name n) "html")
5396  (setf root n)))
5397  node)
5398  (assert root)
5399  (node-to-xmls root parent-ns xlink-defined)))
5400  (:document-fragment
5401  (let (xmls-nodes)
5402  (element-map-children (lambda (node)
5403  (push (node-to-xmls node parent-ns xlink-defined)
5404  xmls-nodes))
5405  node)
5406  (nreverse xmls-nodes)))
5407  (:element
5408  (let (attrs children)
5409  (element-map-attributes (lambda (name node-namespace value)
5410  (when (and namespace
5411  (not xlink-defined)
5412  (equal node-namespace (find-namespace "xlink")))
5413  (push '#.(list "xmlns:xlink" (find-namespace "xlink")) attrs)
5414  (setf xlink-defined t))
5415  (push (list (if node-namespace
5416  name
5417  (xml-escape-name name))
5418  value)
5419  attrs))
5420  node)
5421  (element-map-children (lambda (c)
5422  (push c children))
5423  node)
5424 
5425  (apply #'list
5426  (if (and namespace
5427  (not (equal parent-ns (node-namespace node))))
5428  (cons (node-name node) (node-namespace node))
5429  (xml-escape-name (node-name node)))
5430  attrs
5431  (mapcar (lambda (c)
5432  (node-to-xmls c (node-namespace node) xlink-defined))
5433  (nreverse children)))))
5434  (:text
5435  (node-value node))
5436  (:comment
5437  (when comments
5438  (list :comment nil (node-value node)))))))
5439  (node-to-xmls node nil nil)))
5440 
5441 
5442 (defmethod transform-html5-dom ((to-type (eql :xmls-ns)) node &key)
5443  (transform-html5-dom :xmls node :namespace t))