changelog shortlog graph tags branches changeset files revisions annotate raw help

Mercurial > core / lisp/lib/obj/uri/mask.lisp

changeset 275: 78ef6145e272
author: Richard Westhaver <ellis@rwest.io>
date: Fri, 12 Apr 2024 18:41:40 -0400
permissions: -rw-r--r--
description: return of the uri
1 ;;; obj/uri/mask.lisp --- string character masks for parsing
2 
3 ;;
4 
5 ;;; Code:
6 (in-package :obj/uri)
7 ;; To match sets of characters, the parser uses bit vectors constructed
8 ;; from lists of characters.
9 ;; The size of bit vectors are defined to check for characters in the
10 ;; range 0 to 126 (~). We use location 0 and 1, which are never set by
11 ;; any generated character list, as boolean.
12 (eval-always
13 (defparameter +uri-bit-vector-size+ 127)
14 ;; The is the index at which we store the boolean: does this bitvector
15 ;; allow `ucschar' (from the grammar)?
16 (defparameter +bitvector-index-ucschar+ 0)
17 ;; The is the index at which we store the boolean: does this bitvector
18 ;; allow `iprivate' (from the grammar)?
19 (defparameter +bitvector-index-iprivate+ 1)
20 
21 (defun generate-character-list (char-start char-end)
22  ;; Generate a list of characters between char-start and char-end,
23  ;; inclusive of the start and end characters.
24  (when (>= (char-code char-start) (char-code char-end))
25  (error "char-start (~s) must come before char-end (~s)."
26  char-start char-end))
27  ;; Make sure it doesn't index off the end of the array:
28  (when (>= (char-code char-end) +uri-bit-vector-size+)
29  (error "Illegal char-code (>= ~d)." +uri-bit-vector-size+))
30  (do* ((stop-code (1- (char-code char-start)))
31  (c (char-code char-end) (1- c))
32  (res '()))
33  ((= c stop-code) res)
34  (push (code-char c) res))))
35 
36 (defmacro char-included-p (bit-vector char-code)
37  `(= 1 (sbit ,bit-vector ,char-code)))
38 
39 (defmacro safe-char-included-p (bit-vector char-code)
40  (let ((g-bv (gensym))
41  (g-cc (gensym)))
42  `(let* ((,g-bv ,bit-vector)
43  (,g-cc ,char-code))
44  (or (null ,g-bv)
45  (and (< ,g-cc +uri-bit-vector-size+)
46  (char-included-p ,g-bv ,g-cc))))))
47 
48 (defun make-char-bitvector (chars &key except iri)
49  ;; Return a bitvector which has a 1 for each character represented in
50  ;; CHARS, where the index is the char-code of the character. If EXCEPT
51  ;; is non-nil, it should be a list of characters to exclude.
52  ;;
53  ;; If IRI is non-nil, it should be either :ucschar or :iprivate.
54  ;; Since the first two bits of the bitvector returned by this function
55  ;; are unused (those characters are invalid for URIs and IRIs), we use
56  ;; those bits for IRI validation. During IRI character validation,
57  ;; characters outside the ASCII range are validated with either ucscharp
58  ;; or iprivatep. IRI mode is indicated by .iri-mode. having a non-nil
59  ;; value.
60  (do* ((a (make-array +uri-bit-vector-size+
61  :element-type 'bit :initial-element 0))
62  (chars chars (cdr chars))
63  (c (car chars) (car chars)))
64  ((null chars)
65  (when iri
66  ;; set the booleans for this bitvector, used in .looking-at
67  (ecase iri
68  (:ucschar (setf (sbit a #.+bitvector-index-ucschar+) 1))
69  (:iprivate (setf (sbit a #.+bitvector-index-iprivate+) 1))))
70  a)
71  (if* (and except (member c except :test #'eq))
72  thenret
73  else (setf (sbit a (char-code c)) 1))))
74 
75 ;; Lists of characters used to make the bit vectors. These lists are
76 ;; pretty much straight out of the grammars.
77 (defparameter *alpha-chars*
78  '#.(append (generate-character-list #\A #\Z)
79  (generate-character-list #\a #\z)))
80 
81 (defparameter *digit-chars* '#.(generate-character-list #\0 #\9))
82 
83 (defparameter *hexdig-chars*
84  (append *digit-chars*
85  '#.(generate-character-list #\A #\F)
86  '#.(generate-character-list #\a #\f)))
87 
88 (defparameter *alphanum-chars* (append *alpha-chars* *digit-chars*))
89 (defparameter *alphanum+-chars* (append *alphanum-chars* '(#\-)))
90 
91 (defparameter *sub-delims-chars* '(#\! #\$ #\& #\' #\( #\) #\* #\+ #\, #\; #\=))
92 
93 (defparameter *unreserved-chars*
94  (append *alpha-chars* *digit-chars* '(#\- #\. #\_ #\~)))
95 
96 (defparameter *pchar-chars*
97  (append *unreserved-chars* *sub-delims-chars* '(#\: #\@)))
98 
99 ;; used in pathname to URI conversion:
100 (defparameter *pchar/-chars* (append *pchar-chars* '(#\/)))
101 
102 (defparameter *urn-nss-chars* (append *pchar-chars* '(#\/)))
103 
104 (defparameter *segment-nz-nc-chars* ;; pchar w/o #\:
105  (append *unreserved-chars* *sub-delims-chars* '(#\@)))
106 
107 (defparameter *query-strict-chars* (append *pchar-chars* '(#\/ #\?)))
108 (defparameter *urn-query-chars* (append *pchar-chars* '(#\/)))
109 (defparameter *fragment-strict-chars* (append *pchar-chars* '(#\/ #\?)))
110 
111 (defparameter *ipvfuture-chars*
112  (append *unreserved-chars* *sub-delims-chars* '(#\:)))
113 
114 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
115 
116 (defparameter *alpha-bitvector* (make-char-bitvector *alpha-chars*))
117 (defparameter *digit-bitvector* (make-char-bitvector *digit-chars*))
118 (defparameter *alphanum-bitvector* (make-char-bitvector *alphanum-chars*))
119 (defparameter *alphanum+-bitvector* (make-char-bitvector *alphanum+-chars*))
120 (defparameter *hexdig-bitvector* (make-char-bitvector *hexdig-chars*))
121 (defparameter *pchar-bitvector* (make-char-bitvector *pchar-chars*
122  :iri :ucschar))
123 (defparameter *urn-nss-bitvector* (make-char-bitvector *urn-nss-chars*
124  :iri :ucschar))
125 (defparameter *unreserved-bitvector* (make-char-bitvector *unreserved-chars*
126  :iri :ucschar))
127 
128 ;; used in pathname to URI conversion:
129 (defparameter *pchar/-bitvector* (make-char-bitvector *pchar/-chars*
130  :iri :ucschar))
131 
132 (defparameter *userinfo-bitvector*
133  (make-char-bitvector
134  (append *unreserved-chars* *sub-delims-chars* '(#\:))
135  :iri :ucschar))
136 
137 (defparameter *reg-name-bitvector*
138  (make-char-bitvector (append *unreserved-chars* *sub-delims-chars*)
139  :iri :ucschar))
140 
141 (defparameter *scheme-bitvector*
142  (make-char-bitvector (append *alpha-chars* *digit-chars* '(#\+ #\- #\.))))
143 
144 (defparameter *query-bitvector-strict*
145  (make-char-bitvector *query-strict-chars*
146  :iri :iprivate))
147 
148 (defparameter *query-bitvector-non-strict*
149  (make-char-bitvector (append *query-strict-chars*
150  '(#\| #\^
151  ;; Too many websites/tools use this in URLs
152  #\space))
153  :iri :iprivate))
154 
155 ;;;;;;;;; HACK
156 ;; See discussion in rfe15844. Decoding the query should not touch percent
157 ;; encodings of #\+, #\= and #\&, because those are interpreted by
158 ;; another specification (HTTP).
159 
160 (defparameter *decode-query-strict-chars*
161  (append *unreserved-chars*
162  ;; Instead of *sub-delims-chars*, this (which is just like
163  ;; *sub-delims-chars*, except for the commented out characters):
164  '(#\! #\$ #\' #\( #\) #\* #\, #\;
165  ;;#\& #\+ #\=
166  )
167  '(#\: #\@)))
168 
169 (defparameter *decode-query-bitvector-strict*
170  (make-char-bitvector *decode-query-strict-chars* :iri :iprivate))
171 
172 (defparameter *decode-query-bitvector-non-strict*
173  (make-char-bitvector
174  (append *decode-query-strict-chars*
175  '(#\| #\^
176  ;; Too many websites/tools use this in URLs
177  #\space))
178  :iri :iprivate))
179 ;;;;;;;;; ...HACK
180 
181 (defparameter *fragment-bitvector-strict*
182  (make-char-bitvector *fragment-strict-chars* :iri :ucschar))
183 
184 (defparameter *fragment-bitvector-non-strict*
185  (make-char-bitvector
186  (append *fragment-strict-chars*
187  '(#\#
188  ;; Too many websites/tools use these in URLs
189  #\space #\|))
190  :iri :ucschar))
191 
192 (defparameter *segment-nz-nc-bitvector*
193  (make-char-bitvector *segment-nz-nc-chars* :iri :ucschar))
194 
195 (defparameter *urn-query-bitvector*
196  ;; Not sure which to use, :ucschar or :iprivate. The universe will
197  ;; probably end before anyone figures it out.
198  (make-char-bitvector *urn-query-chars* :iri :iprivate))
199 
200 (defparameter *ipvfuture-bitvector*
201  (make-char-bitvector *ipvfuture-chars* :iri :ucschar))
202 
203 ;; The part of a URI that can have percent encoding:
204 ;; - userinfo
205 ;; - host
206 ;; - path
207 ;; - query
208 ;; - fragment
209 
210 (defun percent-decode-string (string allowed-bitvector)
211  ;; Return a new string based on STRING which has all percent encoded
212  ;; pairs (%xx) turned into real characters. If ALLOWED-BITVECTOR is
213  ;; non-nil, only characters that `match' this bitvector are converted.
214  ;; (declare (type string string))
215  (do* ((i 0 (1+ i))
216  (max (length string))
217  (new-string (make-string max))
218  (new-i 0 (1+ new-i))
219  ch ch2 chc chc2)
220  ((= i max)
221  ;; (nyi! "was formerly a call to EXCL package") - shrinks new-string vector to fit size?
222  (remove #\Nul new-string))
223  (declare (fixnum i max new-i))
224  (if* (char= #\% (setq ch (schar string i)))
225  then (when (> (+ i 3) max)
226  (error "Unsyntactic percent encoding at ~d in ~s." i string))
227  (setq ch (schar string (incf i)))
228  (setq ch2 (schar string (incf i)))
229  (when (not (and (setq chc (digit-char-p ch 16))
230  (setq chc2 (digit-char-p ch2 16))))
231  (error
232  "Non-hexidecimal digits after % at ~d in ~s."
233  (- i 2) string))
234  (let ((ci (the fixnum
235  (+ (the fixnum (* 16 (the fixnum chc)))
236  (the fixnum chc2)))))
237  (declare (fixnum ci))
238  (if* (safe-char-included-p allowed-bitvector ci)
239  then ;; OK to convert
240  (setf (schar new-string new-i)
241  (code-char ci))
242  else ;; leave percent encoded
243  (setf (schar new-string new-i) #\%)
244  (setf (schar new-string (incf new-i)) ch)
245  (setf (schar new-string (incf new-i)) ch2)))
246  else (setf (schar new-string new-i) ch))))
247 
248 ;; This is experimental work in progress.
249 #+ignore
250 (defun percent-decode-utf8-string (string allowed-bitvector)
251  ;; like percent-decode-string, but handle UTF-8 encoded sequences
252 ;;;; chars 0..127 use allowed-bitvector
253 ;;;; chars > 127 use RFC 3629 grammar
254  (do* ((i 0 (1+ i))
255  (max (length string))
256  (new-string (make-string max))
257  (new-i 0 (1+ new-i))
258  ch ch2 chc chc2
259  (state :start)
260  (vec (make-array 4 :element-type '(unsigned-byte 8)))
261  (temps (make-string 1 :element-type 'character))
262  (veci 0))
263  ((= i max)
264  (excl::.primcall 'sys::shrink-svector new-string new-i)
265  new-string)
266  (declare (fixnum i max new-i veci)
267  (type (simple-array (unsigned-byte 8) (4)) vec)
268  (dynamic-extent vec))
269  (cond
270  ((char= #\% (setq ch (schar string i)))
271  (when (> (+ i 3) max)
272  (excl::.parse-error
273  "Unsyntactic percent encoding at ~d in ~s." i string))
274  (setq ch (schar string (incf i)))
275  (setq ch2 (schar string (incf i)))
276  (when (not (and (setq chc (digit-char-p ch 16))
277  (setq chc2 (digit-char-p ch2 16))))
278  (excl::.parse-error
279  "Non-hexidecimal digits after % at ~d in ~s."
280  (- i 2) string))
281  (let ((cc (the fixnum
282  (+ (the fixnum (* 16 (the fixnum chc)))
283  (the fixnum chc2)))))
284  (declare (fixnum cc))
285  (cond
286  ((<= cc #.+uri-bit-vector-size+)
287  (if* (char-included-p allowed-bitvector cc)
288  then ;; OK to convert
289  (setf (schar new-string new-i)
290  (code-char cc))
291  else ;; leave percent encoded
292  (setf (schar new-string new-i) #\%)
293  (setf (schar new-string (incf new-i)) ch)
294  (setf (schar new-string (incf new-i)) ch2)))
295  (t
296  ;; check for valid UTF-8 encoding (from RFC 2234):
297 ;;;; UTF8-octets = *( UTF8-char )
298 ;;;; UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
299 ;;;; UTF8-1 = %x00-7F
300 ;;;; UTF8-2 = %xC2-DF UTF8-tail
301 ;;;; UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
302 ;;;; %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
303 ;;;; UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
304 ;;;; %xF4 %x80-8F 2( UTF8-tail )
305 ;;;; UTF8-tail = %x80-BF
306  ;; We have a little FSM here. `state' can be one of:
307  ;; :start :: looking for markers for UTF8-{2,3,4}
308  ;; :utf8-3a :: have UTF8-3, read %E0, look for %xA0-BF
309  ;; :utf8-3b :: have UTF8-3, read %ED, look for %x80-9F
310  ;; :utf8-4a :: have UTF8-4, read %F0, look for %x90-BF
311  ;; :utf8-4b :: have UTF8-4, read %F4, look for %x80-8F
312  ;; :utf8-tail3 :: look for 3( UTF8-tail )
313  ;; :utf8-tail2 :: look for 2( UTF8-tail )
314  ;; :utf8-tail1 :: look for 1( UTF8-tail )
315  (case state
316  (:start
317 ;;;; UTF8-2
318  (if* (<= #xC2 cc #xDF)
319  then (setf (aref vec 0) cc)
320  (setq veci 1)
321  (setq state :utf8-tail1)
322 ;;;; UTF8-3
323  elseif (= #xE0 cc)
324  then (setf (aref vec 0) cc)
325  (setq veci 1)
326  (setq state :utf8-3a)
327  elseif (or (<= #xE1 cc #xEC)
328  (<= #xEE cc #xEF))
329  then (setf (aref vec 0) cc)
330  (setq veci 1)
331  (setq state :utf8-tail2)
332  elseif (= #xED cc)
333  then (setf (aref vec 0) cc)
334  (setq veci 1)
335  (setq state :utf8-3b)
336 ;;;; UTF8-4
337  elseif (= #xF0 cc)
338  then (setf (aref vec 0) cc)
339  (setq veci 1)
340  (setq state :utf8-4a)
341  elseif (<= #xF1 cc #xF3)
342  then (setf (aref vec 0) cc)
343  (setq veci 1)
344  (setq state :utf8-tail3)
345  elseif (= #xF4 cc)
346  then (setf (aref vec 0) cc)
347  (setq veci 1)
348  (setq state :utf8-4b)
349  else (excl::.parse-error
350 ;;;;TODO:
351  "invalid UTF-8 encoding...FIXME")))
352  (:utf8-3a
353  (if* (<= #xA0 cc #xBF)
354  then (setf (aref vec veci) cc)
355  (incf veci)
356  (setq state :utf8-tail1)
357  else (error "invalid UTF8-3 2nd byte: ~x" cc)))
358  (:utf8-3b
359  (if* (<= #x80 cc #x9F)
360  then (setf (aref vec veci) cc)
361  (incf veci)
362  (setq state :utf8-tail3)
363  else (error "invalid UTF8-3 2nd byte: ~x" cc)))
364  (:utf8-4a
365  (if* (<= #x90 cc #xBF)
366  then (setf (aref vec veci) cc)
367  (incf veci)
368  (setq state :utf8-tail2)
369  else (error "invalid UTF8-4 2nd byte: ~x" cc)))
370  (:utf8-4b
371  (if* (<= #x80 cc #x8F)
372  then (setf (aref vec veci) cc)
373  (incf veci)
374  (setq state :utf8-tail2)
375  else (error "invalid UTF8-4 2nd byte: ~x" cc)))
376  (:utf8-tail3
377  (if* (<= #x80 cc #xBF)
378  then (setf (aref vec veci) cc)
379  (incf veci)
380  (setq state :utf8-tail2)))
381  (:utf8-tail2
382  (if* (<= #x80 cc #xBF)
383  then (setf (aref vec veci) cc)
384  (incf veci)
385  (setq state :utf8-tail1)))
386  (:utf8-tail1
387  (if* (<= #x80 cc #xBF)
388  then (setf (aref vec veci) cc)
389  (setq state :done)))
390  (:done
391  (octets-to-string vec :external-format :utf-8
392  :end veci :string temps)
393  (setf (schar new-string new-i) (char temps 0)))
394  (t (error "internal error: bad state: ~s" state)))))))
395  (t
396  (setq state :start)
397  (setf (schar new-string new-i) ch)))))
398 
399 (defun percent-encode-string (string allowed-bitvector)
400  ;; Return a new string based on STRING which has all characters which do
401  ;; not match ALLOWED-BITVECTOR converted into percent encoded pairs (%xx).
402  ;; Percent-encoded pairs in the string are skipped over, as it is assumed
403  ;; they were required to be encoded.
404  ;;
405  ;; Make a string as big as it possibly needs to be (3 times the original
406  ;; size), and truncate it at the end.
407  ;; (declare (type string string))
408  (declare (optimize (safety 1)))
409  ;;(declare (:explain :calls :types))
410  (do* ((hexchars ;; RFC 3986 section 2.1 says use upper case:
411  "0123456789ABCDEF")
412  (pct (char-code #\%))
413  (max (length string))
414  (new-max (* 3 max)) ;; worst case new size
415  (new-string (make-string new-max))
416  (i 0 (1+ i))
417  (new-i -1)
418  (ci ;; so the fixnum decl is true:
419  0)
420  c)
421  ((= i max)
422  ;; is it safe to delete all 0 chars here?
423  ;; (nyi! "was previously a call to EXCL")
424  (remove #\Nul new-string))
425  (declare (fixnum pct max new-max i new-i ci))
426  (setq ci (char-code (setq c (schar string i))))
427  (if* (or (= ci pct) ;; skip %'s
428  (safe-char-included-p allowed-bitvector ci))
429  then ;; ok as is
430  (incf new-i)
431  (setf (schar new-string new-i) c)
432  else ;; need to escape it
433  (let ((d1 (ash ci -4))
434  (d2 (logand ci #xf)))
435  (declare (fixnum d1 d2))
436  (setf (schar new-string (incf new-i)) #\%)
437  (setf (schar new-string (incf new-i)) (schar hexchars d1))
438  (setf (schar new-string (incf new-i)) (schar hexchars d2))))))
439 
440 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
441 
442 ;; For efficiency, we do as few subseq's as possible. To achieve this, we
443 ;; return, from various parser functions, the start/end pair encoded into a
444 ;; fixnum. This means the limit for a URI string is limited to 16384 on a
445 ;; 32-bit Lisp. It appears from searches that this is well above the
446 ;; accepted maximum for URI strings.
447 
448 (eval-always
449  ;; The max array index is 1/2 of the available fixnum range.
450  (defparameter +uri-max-string-length+
451  #.(expt 2 (truncate (integer-length most-positive-fixnum) 2)))
452  (defparameter +uri-pack-shift+
453  #.(truncate (integer-length most-positive-fixnum) 2))
454  (defparameter +uri-unpack-shift+
455  #.(- (truncate (integer-length most-positive-fixnum) 2)))
456  (defparameter +uri-unpack-mask+
457  #.(1- (ash 1 (truncate (integer-length most-positive-fixnum) 2))))
458 
459  ;; This is used as a marker for the null string. It must be a fixnum
460  ;; that can't be returned as an index into a string.
461  (defparameter *uri-null-marker* -1)
462  )
463 
464 (defun check-xri-string (string)
465  ;; Make sure that:
466  ;; 1. STRING is a simple string, and
467  ;; 2. Two indices into STRING can packed into a single fixnum.
468  ;; This is what xsubseq/val do.
469  (or (stringp string)
470  (error "string must be a simple string."))
471  (or (< (length string) #.+uri-max-string-length+)
472  (error "string is larger than ~d characters."
473  #.+uri-max-string-length+)))
474 
475 (defun xsubseq (start end)
476  ;; Encode START and END into a fixnum.
477  (declare (fixnum start end) (optimize (safety 1)))
478  (the fixnum
479  (+ start (the fixnum
480  (ash end +uri-pack-shift+)))))
481 
482 (defun val (string i)
483  ;; Return the subsequence of STRING given by I, which was encoded with
484  ;; XSUBSEQ.
485  (declare (type (or fixnum null) i) (optimize (safety 1)))
486  (when i
487  (cond
488  ((= i *uri-null-marker*) "")
489  (t (let ((start (the fixnum (logand i +uri-unpack-mask+)))
490  (end (the fixnum
491  (ash i +uri-unpack-shift+))))
492  (declare (fixnum start end))
493  (if* (simple-string-p string)
494  then ;; This is a good bit faster than calling subseq
495  (do* ((len (the fixnum (- end start)))
496  (res (make-string len))
497  (src-index start (the fixnum (1+ src-index)))
498  (dst-index 0 (the fixnum (1+ dst-index))))
499  ((= src-index end) res)
500  (declare (fixnum len src-index dst-index))
501  (setf (schar res dst-index) (schar string src-index)))
502  else (subseq string start end)))))))
503 
504 (defun at-end-p (i end)
505  ;; return T if index I is beyond the END of the string
506  (>= i end))
507 
508 ;; This macro is very specialized and not hygenic. It is built for pure
509 ;; speed.
510 (defmacro .looking-at (simple thing string index end char-equal)
511  ;; INDEX and END are declared FIXNUM by our caller.
512  ;; SIMPLE-STRING-P and SCHAR are much faster than STRINGP and CHAR.
513  ;; For the details of what this function returns, see looking-at below.
514  (let ((stringp (if simple 'simple-string-p 'stringp))
515  (schar (if simple 'schar 'char))
516  ;; TODO
517  (length (if simple 'sequence:length 'length))
518  (len (gensym))
519  (i (gensym))
520  (j (gensym))
521  (x (gensym))
522  (c (gensym)))
523  `(let ((,len 0))
524  (declare (fixnum ,len))
525  (if* (at-end-p ,index ,end)
526  then nil
527  elseif (characterp ,thing)
528  then ;; In this case, we ignore CHAR-EQUAL and always do the
529  ;; character comparison with CHAR= (case sensitively).
530  (when (char= ,thing (,schar ,string ,index))
531  (the fixnum (1+ ,index)))
532  elseif (,stringp ,thing)
533  then (when (not (at-end-p
534  (+ ,index
535  (setq ,len (the fixnum (,length ,thing))))
536  ,end))
537  (do* ((,i ,index (the fixnum (1+ ,i)))
538  (,j 0 (the fixnum (1+ ,j)))
539  (,x ,len (the fixnum (1- ,x))))
540  ((= 0 ,x) (+ ,index ,len))
541  (declare (fixnum ,i ,j ,x))
542  (if* ,char-equal
543  then (when (not (char-equal (,schar ,string ,i)
544  (,schar ,thing ,j)))
545  (return nil))
546  else (when (not (char= (,schar ,string ,i)
547  (,schar ,thing ,j)))
548  (return nil)))))
549  elseif (simple-bit-vector-p ,thing) ;; a LOT faster than bit-vector-p
550  then (let ((,c (char-code (,schar ,string ,index))))
551  (if* (< ,c +uri-bit-vector-size+)
552  then (when (char-included-p ,thing ,c)
553  (the fixnum (1+ ,index)))
554  elseif (and %iri-mode
555  (or
556  ;; If the ucschar or iprivate booleans are set,
557  ;; then check for characters in those ranges.
558  (and (= 1 (sbit ,thing #.+bitvector-index-ucschar+))
559  (ucscharp ,c))
560  (and (= 1 (sbit ,thing #.+bitvector-index-iprivate+))
561  (iprivatep ,c))))
562  then (the fixnum (1+ ,index))))
563  else (error "bad object: ~s." ,thing)))))
564 
565 (defun ucscharp (code)
566  (declare (type fixnum code) (optimize (safety 1)))
567  ;; This is straight from the grammer in RFC 3987, for ucschar.
568  (or (<= #x000A0 code #x0D7FF)
569  (<= #x0F900 code #x0FDCF)
570  (<= #x0FDF0 code #x0FFEF)
571  (<= #x10000 code #x1FFFD)
572  (<= #x20000 code #x2FFFD)
573  (<= #x30000 code #x3FFFD)
574  (<= #x40000 code #x4FFFD)
575  (<= #x50000 code #x5FFFD)
576  (<= #x60000 code #x6FFFD)
577  (<= #x70000 code #x7FFFD)
578  (<= #x80000 code #x8FFFD)
579  (<= #x90000 code #x9FFFD)
580  (<= #xA0000 code #xAFFFD)
581  (<= #xB0000 code #xBFFFD)
582  (<= #xC0000 code #xCFFFD)
583  (<= #xD0000 code #xDFFFD)
584  (<= #xE1000 code #xEFFFD)))
585 
586 (defun iprivatep (code)
587  (declare (fixnum code) (optimize (safety 1)))
588  ;; This is straight from the grammer in RFC 3987, for iprivate.
589  (or (<= #x00E000 code #x00F8FF)
590  (<= #x0F0000 code #x0FFFFD)
591  (<= #x100000 code #x10FFFD)))
592 
593 ;; Future optimization from rfr:
594 ;; If THING is going to be a string very often,
595 ;; then you might get a useful speed improvement by splitting this
596 ;; again based on char-equal true/false. As it is, you're generating
597 ;; code in .looking-at that checks the char-equal argument on every
598 ;; character.
599 (defun looking-at (thing string index end
600  ;; optional because it is rarely given
601  &optional char-equal)
602  ;; Return a new index into the parse buffer (STRING), if
603  ;; an object equivalent to THING exists at index INDEX.
604  ;; THING can be a:
605  ;; - bit vector: if a bit vector, then check that at character
606  ;; code index for it, there is a `1'
607  ;; - string: check that the string is in STRING starting at INDEX
608  ;; - character: check that the character is in STRING starting at
609  ;; INDEX
610  ;; If CHAR-EQUAL is non-nil, then do character comparisons
611  ;; case insensitively with CHAR-EQUAL.
612  (declare (type fixnum index end) (optimize (safety 1)))
613  ;; The simple-string version is much faster, so this is worth the
614  ;; complexity.
615  ;;
616  ;; NOTE: .looking-at takes ONLY symbols. The macro is not hygenic.
617  (if* (simple-string-p string)
618  then (.looking-at t thing string index end char-equal)
619  else (.looking-at nil thing string index end char-equal)))
620 
621 (defun scan-forward (string start end bitvector
622  &optional func)
623  ;; Scan STRING using BITVECTOR for matching, starting from position
624  ;; START, and going no farther than END.
625  ;; Return the index of the first non-matching character, or nil if no
626  ;; characters matched.
627  ;;
628  ;; If BITVECTOR does not match, then call FUNC with three arguments
629  ;; (STRING, <index>, and END). If the FUNC returns nil, then scanning
630  ;; terminates and this function returns <index>, if it is > START.
631  (declare (type fixnum start end)
632  (type (or function null) func)
633  (optimize (safety 1)))
634  (do ((i start)
635  (new-i nil))
636  ((= end i)
637  (if* (= i start)
638  then nil
639  else i))
640  (declare (fixnum i))
641  (cond
642  ((looking-at bitvector string i end)
643  ;; Advance
644  (incf i))
645  (func
646  ;; BITVECTOR failed.
647  (if* (setq new-i (funcall func string i end))
648  then ;; FUNC return non-nil, advance I and keep going...
649  (setq i new-i)
650  else ;; FUNC return NIL, we're done
651  (if* (= i start)
652  then ;; Nothing matched => NIL:
653  (return nil)
654  else ;; Something matched => first index that didn't:
655  (return i))))
656  (t
657  ;; BITVECTOR didn't match. We're done.
658  (if* (= i start)
659  then ;; Nothing matched:
660  (return nil)
661  else ;; Something matched, first index that didn't:
662  (return i))))))