changelog shortlog graph tags branches changeset files revisions annotate raw help

Mercurial > core / lisp/lib/obj/uri/state.lisp

changeset 698: 96958d3eb5b0
parent: 78ef6145e272
author: Richard Westhaver <ellis@rwest.io>
date: Fri, 04 Oct 2024 22:04:59 -0400
permissions: -rw-r--r--
description: fixes
1 ;;; obj/uri/state.lisp --- Parser state
2 
3 ;;
4 
5 ;;; Code:
6 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
7 ;; A note about parser naming conventions.
8 ;; There are two types of functions, where <name> comes from the LHS
9 ;; of the ABNF grammar:
10 ;; state-<name> :: scan and return values based on the parse. The
11 ;; first value is always the "next" index beyond the parse.
12 ;; The subsequent values are rule specific, and documented in
13 ;; the functions themselves.
14 ;; scan-<name> :: scan for and return either nil or an index. If
15 ;; there is match, return the "next" index beyond the match,
16 ;; and nil otherwise.
17 ;;
18 ;; Rules marked `TERMINAL' must check for `at-end-p', since they must
19 ;; terminate the parse for the input to be valid.
20 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
21 (in-package :obj/uri)
22 
23 (defun state-uri (string start end
24  &aux i scheme userinfo host port path query fragment
25  nid nss q-component f-component r-component i2
26  colon urn-scheme file-scheme)
27  ;; rule 01: URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
28  ;; --TERMINAL--
29  ;; values: i scheme userinfo host port path query fragment
30  (if* (and (multiple-value-setq (i scheme) (state-scheme string start end))
31  (setq colon (looking-at #\: string i end))
32  (not (setq urn-scheme (looking-at "urn" string start end t)))
33  (not (setq file-scheme (looking-at "file" string start end t)))
34  (multiple-value-setq (i2 userinfo host port path)
35  (state-hier-part string (1+ i) end)))
36  then ;; Have hier-part...
37  (setq i i2)
38  (when (at-end-p i end)
39  (return-from state-uri
40  (values i scheme userinfo host port path)))
41 
42  (when (looking-at #\? string i end)
43  (if* (multiple-value-setq (i2 query)
44  (state-query string (incf i) end))
45  then (setq i i2)
46  else (setq query #.*uri-null-marker*)))
47 
48  (when (looking-at #\# string i end)
49  (if* (multiple-value-setq (i2 fragment)
50  (state-fragment string (incf i) end))
51  then (setq i i2)
52  else (setq fragment #.*uri-null-marker*)))
53 
54  (when (at-end-p i end)
55  (values i scheme userinfo host port path query fragment))
56  elseif urn-scheme
57  then ;; values: i "urn" nid r-component nil nss q-component f-component
58  (when (multiple-value-setq (i nid nss q-component f-component
59  r-component)
60  (state-urn-namestring string i end))
61  (values i
62  scheme
63  r-component ;userinfo
64  nid ;host
65  nil ;port
66  nss ;path
67  q-component ;query
68  f-component ;fragment
69  ))
70  elseif (and file-scheme
71  (multiple-value-setq (i path)
72  (state-uri-file string colon end)))
73  then (values i scheme nil nil nil path)
74  elseif (and scheme colon)
75  then ;; Something like "mailto:foo@bar.com". Put the
76  ;; the non-scheme part into the path
77  (values end scheme nil nil nil (xsubseq colon end))))
78 
79 ;; called by parse-uri-string-rfc3986
80 (defun state-uri-reference (string start end
81  &aux i scheme userinfo host port path query
82  fragment)
83  ;; rule 02: URI-reference = URI / relative-ref
84  ;; values: i scheme host userinfo port path query fragment
85  (if* (multiple-value-setq (i scheme userinfo host port path query
86  fragment)
87  (state-uri string start end))
88  then (values i scheme userinfo host port path query fragment)
89  elseif (multiple-value-setq (i userinfo host port path query fragment)
90  (state-relative-ref string start end))
91  then (values i nil userinfo host port path query fragment)))
92 
93 ;; called by parse-uri-string-rfc3986
94 (defun state-absolute-uri (string start end
95  &aux i scheme userinfo host port path query i2
96  colon urn-scheme file-scheme)
97  ;; rule 03: absolute-URI = scheme ":" hier-part [ "?" query ]
98  ;; --TERMINAL--
99  ;; values: i scheme userinfo host port path query
100  (if* (and (multiple-value-setq (i scheme) (state-scheme string start end))
101  (setq colon (looking-at #\: string i end))
102  (not (setq urn-scheme (looking-at "urn" string start end t)))
103  (not (setq file-scheme (looking-at "file" string start end t)))
104  (multiple-value-setq (i2 userinfo host port path)
105  (state-hier-part string colon end)))
106  then ;; so far: scheme + ":" + hier-part
107  (setq i i2)
108  (if* (at-end-p i end)
109  then (values i scheme userinfo host port path)
110  elseif (and (looking-at #\? string i end)
111  (multiple-value-setq (i query)
112  (state-query string (incf i) end))
113  (at-end-p i end))
114  then (values i scheme userinfo host port path query))
115  elseif urn-scheme
116  then ;; values: i "urn" nid r-component nil nss q-component f-component
117  (multiple-value-bind (i3 nid nss q-component f-component r-component)
118  (state-urn-namestring string (incf i) end)
119  (when i3
120  (values i3
121  scheme
122  r-component ;userinfo
123  nid ;host
124  nil ;port
125  nss ;path
126  q-component ;query
127  f-component ;fragment
128  )))
129  elseif (and file-scheme
130  (multiple-value-setq (i path)
131  (state-uri-file string colon end)))
132  then (values i scheme nil nil nil path)
133  elseif (and scheme colon)
134  then ;; Something like "mailto:foo@bar.com". Put the
135  ;; the non-scheme part into the path
136  (values end scheme nil nil nil (xsubseq colon end))))
137 
138 (defun state-hier-part (string start end &aux i userinfo host port
139  path i2)
140  ;; rule 04: hier-part = "//" authority path-abempty
141  ;; / "//" path-absolute ***NEW***
142  ;; / path-absolute
143  ;; / path-rootless
144  ;; / path-empty
145  ;; values: i userinfo host port path
146  (if* (and (setq i (looking-at "//" string start end))
147  (multiple-value-setq (i userinfo host port)
148  (state-authority string i end)))
149  then (if* (multiple-value-setq (i2 path) (state-path-abempty string i end))
150  then (values i2 userinfo host port path)
151  else (values i userinfo host port))
152  elseif (and (setq i (looking-at "//" string start end))
153  (multiple-value-setq (i path)
154  (state-path-absolute string i end)))
155  then (values i nil nil nil path)
156  elseif (or
157  (multiple-value-setq (i path) (state-path-absolute string start end))
158  (multiple-value-setq (i path) (state-path-rootless string start end))
159  (multiple-value-setq (i path) (state-path-empty string start end)))
160  then (values i nil nil nil path)))
161 
162 (defun state-relative-ref (string start end &aux i2 query fragment)
163  ;; rule 05: relative-ref = relative-part [ "?" query ] [ "#" fragment ]
164  ;; --TERMINAL--
165  ;; values: i userinfo host port path query fragment
166  (multiple-value-bind (i userinfo host port path)
167  (state-relative-part string start end)
168  (when i
169  (if* (at-end-p i end)
170  then (values i userinfo host port path)
171  else (when (looking-at #\? string i end)
172  (if* (multiple-value-setq (i2 query)
173  (state-query string (incf i) end))
174  then (setq i i2)
175  else (setq query #.*uri-null-marker*)))
176 
177  (when (looking-at #\# string i end)
178  (if* (multiple-value-setq (i2 fragment)
179  (state-fragment string (incf i) end))
180  then (setq i i2)
181  else (setq fragment #.*uri-null-marker*)))
182 
183  (when (at-end-p i end)
184  (values i userinfo host port path query fragment))))))
185 
186 (defun state-relative-part (string start end
187  &aux (i start) path userinfo host port i2)
188  ;; rule 06: relative-part = "//" authority path-abempty
189  ;; / path-absolute
190  ;; / path-noscheme
191  ;; / path-empty
192  ;; values: i userinfo host port path
193  (if* (and (setq i (looking-at "//" string i end))
194  (multiple-value-setq (i userinfo host port)
195  (state-authority string i end)))
196  then (if* (multiple-value-setq (i2 path) (state-path-abempty string i end))
197  then (values i2 userinfo host port path)
198  else (values i userinfo host port))
199  elseif (or
200  (multiple-value-setq (i path) (state-path-absolute string start end))
201  (multiple-value-setq (i path) (state-path-noscheme string start end))
202  (multiple-value-setq (i path) (state-path-empty string start end)))
203  then (values i nil nil nil path)))
204 
205 (defun state-scheme (string start end &aux i scheme)
206  ;; rule 07: scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
207  ;; values: i scheme
208  (when (looking-at *alpha-bitvector* string start end)
209  (if* (setq i (scan-forward string (1+ start) end *scheme-bitvector*))
210  then (setq scheme (xsubseq start i))
211  else ;; just the one char
212  (setq scheme (xsubseq start (setq i (1+ start)))))
213  (values i scheme)))
214 
215 (defun state-authority (string start end &aux i i2 userinfo host ipv6 zone-id
216  port)
217  ;; rule 08: authority = [ userinfo "@" ] host [ ":" port ]
218  ;; values: i userinfo host port
219  (cond
220  ((and (multiple-value-setq (i userinfo) (state-userinfo string start end))
221  (setq i (looking-at #\@ string i end))
222  (multiple-value-setq (i host ipv6 zone-id)
223  (state-host string i end)))
224  ;; Somewhat of a hack, but I don't want to change all the functions
225  ;; to expect even more multiple values:
226  (when ipv6 (setq host (list host ipv6 zone-id)))
227 
228  ;; have: userinfo "@" host
229  (if* (not (setq i2 (looking-at #\: string i end)))
230  then ;; done, return what we have
231  (values i userinfo host)
232  elseif (multiple-value-setq (i port) (state-port string i2 end))
233  then ;; found ":" and port
234  (values i userinfo host port)
235  else ;; found ":" and NO port
236  (values i2 userinfo host)))
237 
238  ;; no userinfo, check for host
239  ((multiple-value-setq (i host ipv6 zone-id) (state-host string start end))
240  ;; Somewhat of a hack, but I don't want to change all the functions
241  ;; to expect even more multiple values:
242  (when ipv6 (setq host (list host ipv6 zone-id)))
243 
244  (if* (not (setq i2 (looking-at #\: string i end)))
245  then (values i nil host)
246  elseif (multiple-value-setq (i port) (state-port string i2 end))
247  then (values i nil host port)
248  else ;; found ":" and NO port
249  (values i2 nil host)))))
250 
251 (defun state-userinfo (string start end &aux i)
252  ;; rule 09: userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
253  ;;
254  ;; This one is more difficult, due to the alternation with
255  ;; pct-encoded:
256  ;; *( unreserved / pct-encoded / sub-delims / ":" )
257  ;; All the others are just characters, but pct-encoded is a
258  ;; specific sequence of characters.
259  (when (setq i (scan-forward string start end *userinfo-bitvector*
260  #'scan-pct-encoded))
261  (values i (xsubseq start i))))
262 
263 (defun state-port (string start end &aux i)
264  ;; rule 11: port = *DIGIT
265  (when (setq i (scan-forward string start end *digit-bitvector*))
266  (values i (xsubseq start i))))
267 
268 (defun state-host (string start end &aux i host ipv6 zone-id)
269  ;; rule 10: host = IP-literal / IPv4address / reg-name
270  ;; values: i host ipv6 zone-id
271  (if* (multiple-value-setq (i ipv6 zone-id)
272  (state-ip-literal string start end))
273  then (values i nil ipv6 zone-id)
274  elseif (or
275  (multiple-value-setq (i host) (state-ipv4address string start end))
276  (multiple-value-setq (i host) (state-reg-name string start end)))
277  then (values i host)))
278 
279 (defun state-ip-literal (string start end &aux ip-start i2 end-ip ip zone-id)
280  ;; rule 12a: IP-literal = "[" ( IPv6addrz / IPvFuture ) "]"
281  ;; values: i ipaddr zone-id
282  ;; NOTE: the [ and ] are not returned as part of the host.
283  (when (and (setq ip-start (looking-at #\[ string start end))
284  (or (multiple-value-setq (end-ip ip zone-id)
285  (state-ipv6addrz string ip-start end))
286  (multiple-value-setq (end-ip ip zone-id)
287  (state-ipvfuture string ip-start end)))
288  (setq i2 (looking-at #\] string end-ip end)))
289  (values i2 ip zone-id)))
290 
291 (defun state-ipv6addrz (string start end &aux ip-end zone-start zone-end)
292  ;; rule 12b: IPv6addrz = IPv6address [ "%25" ZoneID ]
293  ;; values: i ipaddr zone-id
294  (when (setq ip-end (scan-ipv6address string start end))
295  (if* (and (setq zone-start (looking-at "%25" string ip-end end))
296  (setq zone-end (scan-zone-id string zone-start end)))
297  then (values zone-end
298  (xsubseq start ip-end)
299  (xsubseq zone-start zone-end))
300  else (values ip-end (xsubseq start ip-end)))))
301 
302 (defun scan-zone-id (string start end)
303  ;; rule 12c: ZoneID = 1*( unreserved / pct-encoded )
304  (scan-forward string start end *unreserved-bitvector* #'scan-pct-encoded))
305 
306 (defun state-ipvfuture (string start end &aux i)
307  ;; rule 13:
308  ;; IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )
309  ;; values: i ipvfuture
310  (when (and (setq i (looking-at #\v string start end))
311  (setq i (scan-forward string i end *hexdig-bitvector*))
312  (setq i (looking-at #\. string i end))
313  (setq i (scan-forward string i end *ipvfuture-bitvector*)))
314  (values i (xsubseq start i))))
315 
316 (defun scan-ipv6address (string start end &aux (i start))
317  ;; rule 14:
318  ;; IPv6address = 6( h16 ":" ) ls32 [1]
319  ;; / "::" 5( h16 ":" ) ls32 [2]
320  ;; / [ h16 ] "::" 4( h16 ":" ) ls32 [3]
321  ;; / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32 [4]
322  ;; / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32 [5]
323  ;; / [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32 [6]
324  ;; / [ *4( h16 ":" ) h16 ] "::" ls32 [7]
325  ;; / [ *5( h16 ":" ) h16 ] "::" h16 [8]
326  ;; / [ *6( h16 ":" ) h16 ] "::" [9]
327  ;; / "::" [10]
328  (or
329  (and (setq i (scan-h16-colon-pairs string start end 6 6)) ;; [1]
330  (setq i (scan-ls32 string i end)))
331  (and (setq i (looking-at "::" string start end)) ;; [2]
332  (setq i (scan-h16-colon-pairs string i end 5 5))
333  (setq i (scan-ls32 string i end)))
334  (and (setq i (scan-h16 string start end)) ;; [3]
335  (setq i (looking-at "::" string i end))
336  (setq i (scan-h16-colon-pairs string i end 4 4))
337  (setq i (scan-ls32 string i end)))
338  (setq i (scan-ipv6address-part4 string start end)) ;; [4]
339  (setq i (scan-ipv6address-part5 string start end)) ;; [5]
340  (setq i (scan-ipv6address-part6 string start end)) ;; [6]
341  (setq i (scan-ipv6address-part7 string start end)) ;; [7]
342  (setq i (scan-ipv6address-part8 string start end)) ;; [8]
343  (and (setq i (scan-h16-colon-pairs string start end 0 6)) ;; [9]
344  (setq i (scan-h16 string i end))
345  (setq i (looking-at "::" string i end)))
346  (setq i (looking-at "::" string start end)) ;; [10]
347  ))
348 
349 (defun scan-ipv6address-part4 (string start end &aux i)
350  ;; rule: [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
351  (or (and (setq i (looking-at "::" string start end))
352  (setq i (scan-h16-colon-pairs string i end 3 3))
353  (setq i (scan-ls32 string i end)))
354 
355  (and (setq i (scan-h16-colon-pairs string start end 0 1))
356  (setq i (scan-h16 string i end))
357  (setq i (looking-at "::" string i end))
358  (setq i (scan-h16-colon-pairs string i end 3 3))
359  (setq i (scan-ls32 string i end)))))
360 
361 (defun scan-ipv6address-part5 (string start end &aux i)
362  ;; rule: [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
363  (or (and (setq i (looking-at "::" string start end))
364  (setq i (scan-h16-colon-pairs string i end 2 2))
365  (setq i (scan-ls32 string i end)))
366 
367  (and (setq i (scan-h16-colon-pairs string start end 0 2))
368  (setq i (scan-h16 string i end))
369  (setq i (looking-at "::" string i end))
370  (setq i (scan-h16-colon-pairs string i end 2 2))
371  (setq i (scan-ls32 string i end)))))
372 
373 (defun scan-ipv6address-part6 (string start end &aux i)
374  ;; rule: [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32
375  (or (and (setq i (looking-at "::" string start end))
376  (setq i (scan-h16 string i end))
377  (setq i (looking-at #\: string i end))
378  (setq i (scan-ls32 string i end)))
379  (and (setq i (scan-h16-colon-pairs string start end 0 3))
380  (setq i (scan-h16 string i end))
381  (setq i (looking-at "::" string i end))
382  (setq i (scan-h16 string i end))
383  (setq i (looking-at #\: string i end))
384  (setq i (scan-ls32 string i end)))))
385 
386 (defun scan-ipv6address-part7 (string start end &aux i)
387  ;; rule: [ *4( h16 ":" ) h16 ] "::" ls32
388  (or (and (setq i (looking-at "::" string start end))
389  (setq i (scan-ls32 string i end)))
390  (and (setq i (scan-h16-colon-pairs string start end 0 4))
391  (setq i (scan-h16 string i end))
392  (setq i (looking-at "::" string i end))
393  (setq i (scan-ls32 string i end)))))
394 
395 (defun scan-ipv6address-part8 (string start end &aux i)
396  ;; rule: [ *5( h16 ":" ) h16 ] "::" h16
397  (or (and (setq i (looking-at "::" string start end))
398  (setq i (scan-h16 string i end)))
399  (and (setq i (scan-h16-colon-pairs string start end 0 5))
400  (setq i (scan-h16 string i end))
401  (setq i (looking-at "::" string i end))
402  (setq i (scan-h16 string i end)))))
403 
404 (defun scan-h16-colon-pairs (string start end min max
405  &aux (i start)
406  i2
407  (nfound 0))
408  ;; subrule: min*max( h16 ":" )
409  ;; Scan from min to max pairs of: h16 + ":"
410  ;; NOTE: this function needs to lookahead to make sure there isn't a ::
411  ;; after the h16.
412  (loop while (and (< nfound max)
413  (setq i2 (scan-h16 string i end))
414  (setq i2 (looking-at #\: string i2 end))
415  (< i2 end)
416  (not (looking-at #\: string i2 end)))
417  do
418  (setq i i2)
419  (incf nfound))
420  (when (<= min nfound max)
421  i))
422 
423 (defun scan-h16 (string start end &aux i)
424  ;; rule 15: h16 = 1*4HEXDIG
425  (when (null start) (error "start is null"))
426  (when (and (setq i
427  (scan-forward string start
428  ;; only look 5 ahead
429  (min end (+ start 5))
430  *hexdig-bitvector*))
431  (<= 1 (the fixnum (- i start)) 4))
432  i))
433 
434 (defun scan-ls32 (string start end &aux i)
435  ;; rule 16: ls32 = ( h16 ":" h16 ) / IPv4address
436  (if* (and (setq i (scan-h16 string start end))
437  (setq i (looking-at #\: string i end))
438  (setq i (scan-h16 string i end)))
439  then i
440  else (scan-ipv4address string start end)))
441 
442 (defun scan-ipv4address (string start end &aux i)
443  ;; rule 17:
444  ;; IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
445  ;; values: i
446  (and (setq i (scan-dec-octet string start end))
447  (setq i (looking-at #\. string i end))
448  (setq i (scan-dec-octet string i end))
449  (setq i (looking-at #\. string i end))
450  (setq i (scan-dec-octet string i end))
451  (setq i (looking-at #\. string i end))
452  (scan-dec-octet string i end)))
453 
454 (defun state-ipv4address (string start end &aux i)
455  ;; values: i ipv4
456  (when (setq i (scan-ipv4address string start end))
457  (values i (xsubseq start i))))
458 
459 (defun scan-dec-octet (string start end &aux i)
460  ;; rule 18:
461  ;; dec-octet = DIGIT ; 0-9
462  ;; / %x31-39 DIGIT ; 10-99
463  ;; / "1" 2DIGIT ; 100-199
464  ;; / "2" %x30-34 DIGIT ; 200-249
465  ;; / "25" %x30-35 ; 250-255
466  ;; Honestly, the above makes little sense to me. The truth is,
467  ;; "http://256.0.0.1/" is a valid URI because even though it doesn't
468  ;; parse as a dec-octet, it does parse as a reg-name (rule 19).
469  (when (and (setq i (scan-forward string start end *digit-bitvector*))
470  (<= 1 (- i start) 3))
471  i))
472 
473 (defun state-reg-name (string start end &aux i)
474  ;; rule 19: reg-name = *( unreserved / pct-encoded / sub-delims )
475  ;; values: i host
476  (when (setq i (scan-forward string start end *reg-name-bitvector*
477  #'scan-pct-encoded))
478  (values i (xsubseq start i))))
479 
480 (defun state-path-abempty (string start end &aux i i2)
481  ;; rule 21: path-abempty = *( "/" *pchar )
482  ;; values: i path
483  ;; NOTE: if *strict-parse* is nil, we allow the leading "/" to be "//",
484  ;; because it is a common typo in HTML and sometimes fixing it is
485  ;; not under our control. Browsers work fine with this
486  ;; non-conformance.
487  (when (and (not *strict-parse*)
488  (looking-at "//" string start end))
489  ;; double leading slash is changed to a single leading slash.
490  (incf start))
491  (setq i start)
492  (loop
493  (setq i2 nil)
494  (if* (looking-at #\/ string i end)
495  then (if* (setq i2 (scan-pchar string (1+ i) end))
496  then (setq i i2)
497  else (incf i) ;; advance for the / we found
498  (return))
499  else (return)))
500  (when (> i start)
501  (values i (xsubseq start i))))
502 
503 (defun state-path-absolute (string start end &aux (i start) i2 have-slash)
504  ;; rule 22: path-absolute = "/" [ 1*pchar *( "/" *pchar ) ]
505  ;; remember: [ foo ] means 0*1( foo )
506  ;; values: i path
507  (when (setq i (looking-at #\/ string i end))
508  (when (setq i2 (scan-pchar string i end))
509  ;; parse is good to here
510  (setq i i2
511  i2 nil)
512  ;; Now, look for *( "/" *pchar )
513  (loop while (and (setq have-slash (looking-at #\/ string i end))
514  (setq i2 (scan-pchar string have-slash end)))
515  do (setq i i2))
516  ;; If it ends with a /:
517  (when (and have-slash (not i2)) (incf i)))
518  (values i (xsubseq start i))))
519 
520 (defun state-path-noscheme (string start end &aux (i start) i2 have-slash)
521  ;; rule 23: path-noscheme = segment-nz-nc *( "/" *pchar )
522  ;; values: i path
523  (when (setq i (scan-segment-nz-nc string i end))
524  (loop while (and (setq have-slash (looking-at #\/ string i end))
525  (setq i2 (scan-pchar string (1+ i) end)))
526  do (setq i i2))
527  (when (and have-slash (not i2))
528  ;; for the slash we did see:
529  (incf i))
530  (values i (xsubseq start i))))
531 
532 (defun state-path-rootless (string start end &aux (i start) i2)
533  ;; rule 24: path-rootless = 1*pchar *( "/" *pchar )
534  ;; values: i path
535  (when (setq i (scan-pchar string i end))
536  (loop while (and (looking-at #\/ string i end)
537  ;; The pchar after the slash is optional
538  (setq i2 (or (scan-pchar string (1+ i) end)
539  (1+ i))))
540  do (setq i i2))
541  (values i (xsubseq start i))))
542 
543 (defun state-path-empty (string start end)
544  ;; rule 25: path-empty = 0<pchar>
545  ;; values: i path
546  ;; NOTE: the RHS was updated in RFC 3986 errata to be "", but that is
547  ;; bogus. "" is very different the 0<pchar>.
548  ;; Return nil when looking at a `pchar' and the null marker otherwise.
549  (declare (optimize (safety 0)))
550  (if* (looking-at *pchar-bitvector* string start end)
551  then nil
552  else (values start #.*uri-null-marker*)))
553 
554 (defun scan-segment-nz-nc (string start end)
555  ;; rule 28: 1*( unreserved / pct-encoded / sub-delims / "@" )
556  ;; In english: pchar without #\:
557  (declare (optimize (safety 0)))
558  (scan-forward string start end *segment-nz-nc-bitvector* #'scan-pct-encoded))
559 
560 (defun scan-pchar (string start end)
561  ;; rule 29: pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
562  (declare (optimize (safety 0)))
563  (scan-forward string start end *pchar-bitvector* #'scan-pct-encoded))
564 
565 (defun state-query (string start end &aux i)
566  ;; rule 30: *( pchar / "/" / "?" )
567  ;; values: i query
568  (when (setq i
569  (scan-forward string start end
570  (if* *strict-parse*
571  then *query-bitvector-strict*
572  else *query-bitvector-non-strict*)
573  #'scan-pct-encoded))
574  (values i (xsubseq start i))))
575 
576 (defun state-fragment (string start end &aux i)
577  ;; rule 31: *( pchar / "/" / "?" / "#" )
578  ;; NOTE: Allegro CL added "#" in non-strict mode
579  ;; values: i fragment
580  (when (setq i
581  (scan-forward string start end
582  (if* *strict-parse*
583  then *fragment-bitvector-strict*
584  else *fragment-bitvector-non-strict*)
585  #'scan-pct-encoded))
586  (values i (xsubseq start i))))
587 
588 (defvar .pct-encoded. nil)
589 
590 (defun scan-pct-encoded (string start end)
591  ;; This scans a single percent encoded sequence. It does no conversion.
592  ;; It also sets .pct-encoded., which is a boolean that says "this string
593  ;; has some percent encoded characters in it."
594  ;;
595  ;; rule 32: pct-encoded = "%" HEXDIG HEXDIG
596  (declare (fixnum start end))
597  (and (> (the fixnum (- end start)) 2) ;; ... at least 3 chars remaining
598  (looking-at #\% string start end)
599  (looking-at *hexdig-bitvector* string (incf start) end)
600  (looking-at *hexdig-bitvector* string (incf start) end)
601  (setq .pct-encoded. start)))
602 
603 (defun state-uri-file (string start end &aux i)
604  ;; rule: uri-file = "//" <anything>
605  ;; --TERMINAL--
606  ;; values: i path
607  ;; It's not the job of the URI parser to validate file:// URIs.
608  (when (setq i (looking-at "//" string start end))
609  (values i (xsubseq i end))))
610 
611 (defun state-urn-namestring (string start end
612  &aux (i start) i2 nid nss q-component f-component
613  r-component)
614  ;; rule 50: namestring = assigned-name
615  ;; [ rq-components ]
616  ;; [ "#" f-component ]
617  ;; rule 58: f-component = fragment
618  ;; START is just after "urn:".
619  ;; values: i nid nss q-component f-component r-component
620  (when (multiple-value-setq (i2 nid nss)
621  (state-urn-assigned-name string start end))
622  (when (at-end-p i2 end)
623  (return-from state-urn-namestring (values i2 nid nss)))
624 
625  (setq i i2)
626  (when (multiple-value-setq (i2 r-component q-component)
627  (state-urn-rq-components string i end))
628  (when (at-end-p i2 end)
629  (return-from state-urn-namestring
630  (values i2 nid nss q-component nil r-component)))
631  (setq i i2)
632  ;; more STRING to process...
633 
634  (when (looking-at #\# string i end)
635  (if* (multiple-value-setq (i2 f-component)
636  ;; Yes, the same fragment (RFC 8141 defines f-component in
637  ;; terms of RFC 3986's fragment).
638  (state-fragment string (incf i) end))
639  then (setq i i2)
640  else (setq f-component #.*uri-null-marker*)))
641 
642  (when (at-end-p i end)
643  (values i2 nid nss q-component f-component r-component)))))
644 
645 (defun state-urn-assigned-name (string start end &aux i i2 nid nss)
646  ;; rule 51: assigned-name = "urn" ":" NID ":" NSS
647  ;; START is just after "urn:".
648  ;; values: i nid nss
649  (when (and (multiple-value-setq (i2 nid) (state-urn-nid string start end))
650  (looking-at #\: string i2 end)
651  (setq i (1+ i2))
652  (multiple-value-setq (i2 nss) (state-urn-nss string i end)))
653  (values i2 nid nss)))
654 
655 (defun state-urn-nid (string start end &aux (i start))
656  ;; rule 52: NID = (alphanum) 0*30(ldh) (alphanum)
657  ;; rule 53: ldh = alphanum / "-"
658  ;; values: i nid
659  (declare (fixnum start end i))
660  (when (and (looking-at *alphanum-bitvector* string i end)
661  (setq i (scan-forward string (1+ i) end *alphanum+-bitvector*))
662  ;; Check for <= 32 chars, thus far
663  (<= (the fixnum (- i start))
664  32)
665  ;; If the last one was alphanum, then we're done.
666  ;; If the last one was NOT alphanum, then:
667  ;; 1. make sure we had 30 chars (not 31)
668  ;; 2. look for another, single alphanum
669  (or (looking-at *alphanum-bitvector* string (1- i) end)
670  (and (<= (the fixnum (- i start))
671  31)
672  (not (at-end-p i end))
673  (setq i
674  (scan-forward string i end *alphanum-bitvector*)))))
675  (values i (xsubseq start i))))
676 
677 (defun state-urn-nss (string start end &aux i i2)
678  ;; rule 54: NSS = pchar *(pchar / "/")
679  ;; values: i nss
680  (when (setq i (scan-pchar string start (1+ start)))
681  (if* (setq i2 (scan-forward
682  string i end
683  ;; See the definition of *urn-nss-chars* for
684  ;; why we don't use *pchar/-bitvector* here.
685  *urn-nss-bitvector*
686  #'scan-pct-encoded))
687  then (values i2 (xsubseq start i2))
688  else (values i (xsubseq start i)))))
689 
690 (defun state-urn-rq-components (string start end
691  &aux i ri qi r-component q-component)
692  ;; rule 55: rq-components = [ "?+" r-component ]
693  ;; [ "?=" q-component ]
694  ;; values: i r-component q-component
695  (when (and (setq i (looking-at #\? string start end))
696  (not (at-end-p i end))
697  (or (setq ri (looking-at #\+ string i end))
698  (setq qi (looking-at #\= string i end)))
699  (not (at-end-p (or ri qi) end)))
700  (when (and ri (multiple-value-setq (i r-component)
701  (state-urn-r-component string ri end)))
702  (when (at-end-p i end)
703  (return-from state-urn-rq-components
704  (values i r-component)))
705 
706  (if* (setq qi (looking-at #\? string i end))
707  then (when (and (not (at-end-p qi end))
708  (setq qi (looking-at #\= string qi end))
709  (not (at-end-p qi end)))
710  (when (multiple-value-setq (i q-component)
711  (state-urn-q-component string qi end))
712  (return-from state-urn-rq-components
713  (values i r-component q-component))))
714  else (return-from state-urn-rq-components (values i r-component))))
715  ;; The r-component branch didn't yield anything, check for q-component
716 
717  (when (and qi (multiple-value-setq (i q-component)
718  (state-urn-q-component string qi end)))
719  (return-from state-urn-rq-components
720  (values i nil q-component)))))
721 
722 (defun scan-q-component-or-pct-encoded (string i end &aux i2)
723  ;; Do what scan-pct-encoded does, BUT STOP scanning if we see "?=",
724  ;; because that is the start of the q-component.
725  ;;
726  ;; This function is called by SCAN-FORWARD at each character position in
727  ;; STRING.
728 
729  (when (setq i2 (scan-pct-encoded string i end))
730  (return-from scan-q-component-or-pct-encoded i2))
731 
732  (when (setq i2 (looking-at #\? string i end))
733  (if* (and
734  ;; at least 2 chars remaining (for 1 char after ?=)
735  (> (- end i2) 1)
736  (looking-at #\= string i2 end))
737  then ;; stop scanning
738  (return-from scan-q-component-or-pct-encoded nil)
739  else ;; return the index after the ?
740  (return-from scan-q-component-or-pct-encoded i2))))
741 
742 (defun state-urn-r-component (string start end &aux i i2)
743  ;; rule 56: r-component = pchar *( pchar / "/" / "?" )
744  ;; values: i r-component
745  (when (setq i (scan-pchar string start end))
746  (when (at-end-p i end)
747  (return-from state-urn-r-component
748  (values i (xsubseq start i))))
749  (cond
750  ((setq i2
751  (scan-forward
752  string i end
753  ;; NOTE: we don't use *query-bitvector-strict* because we need
754  ;; to handle #\? specially (see the next argument).
755  *urn-query-bitvector*
756  ;; NOTE: Because r-component can contain "?" without percent
757  ;; encoding, when processing the r-component we need to
758  ;; look ahead to make sure there is no #\= after each
759  ;; #\? (since that means we have a q-component).
760  #'scan-q-component-or-pct-encoded))
761  (values i2 (xsubseq start i2)))
762 
763  ;; We immediately ran into ?=, so return what we found so far:
764  (t (values i (xsubseq start i))))))
765 
766 (defun state-urn-q-component (string start end &aux i)
767  ;; rule 57: q-component = pchar *( pchar / "/" / "?" )
768  ;; values: i q-component
769  (when (setq i (looking-at *pchar-bitvector* string start end))
770  (when (at-end-p i end)
771  (return-from state-urn-q-component
772  (values i (xsubseq start i))))
773  (when (setq i
774  (scan-forward string i end *query-bitvector-strict*
775  #'scan-pct-encoded))
776  (values i (xsubseq start i)))))