Mercurial > core / lisp/lib/obj/uri/state.lisp
changeset 698: |
96958d3eb5b0 |
parent: |
78ef6145e272
|
author: |
Richard Westhaver <ellis@rwest.io> |
date: |
Fri, 04 Oct 2024 22:04:59 -0400 |
permissions: |
-rw-r--r-- |
description: |
fixes |
1 ;;; obj/uri/state.lisp --- Parser state 6 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 7 ;; A note about parser naming conventions. 8 ;; There are two types of functions, where <name> comes from the LHS 9 ;; of the ABNF grammar: 10 ;; state-<name> :: scan and return values based on the parse. The 11 ;; first value is always the "next" index beyond the parse. 12 ;; The subsequent values are rule specific, and documented in 13 ;; the functions themselves. 14 ;; scan-<name> :: scan for and return either nil or an index. If 15 ;; there is match, return the "next" index beyond the match, 18 ;; Rules marked `TERMINAL' must check for `at-end-p', since they must 19 ;; terminate the parse for the input to be valid. 20 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 23 (defun state-uri (string start end 24 &aux i scheme userinfo host port path query fragment 25 nid nss q-component f-component r-component i2 26 colon urn-scheme file-scheme) 27 ;; rule 01: URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] 29 ;; values: i scheme userinfo host port path query fragment 30 (if* (and (multiple-value-setq (i scheme) (state-scheme string start end)) 31 (setq colon (looking-at #\: string i end)) 32 (not (setq urn-scheme (looking-at "urn" string start end t))) 33 (not (setq file-scheme (looking-at "file" string start end t))) 34 (multiple-value-setq (i2 userinfo host port path) 35 (state-hier-part string (1+ i) end))) 36 then ;; Have hier-part... 38 (when (at-end-p i end) 39 (return-from state-uri 40 (values i scheme userinfo host port path))) 42 (when (looking-at #\? string i end) 43 (if* (multiple-value-setq (i2 query) 44 (state-query string (incf i) end)) 46 else (setq query #.*uri-null-marker*))) 48 (when (looking-at #\# string i end) 49 (if* (multiple-value-setq (i2 fragment) 50 (state-fragment string (incf i) end)) 52 else (setq fragment #.*uri-null-marker*))) 54 (when (at-end-p i end) 55 (values i scheme userinfo host port path query fragment)) 57 then ;; values: i "urn" nid r-component nil nss q-component f-component 58 (when (multiple-value-setq (i nid nss q-component f-component 60 (state-urn-namestring string i end)) 70 elseif (and file-scheme 71 (multiple-value-setq (i path) 72 (state-uri-file string colon end))) 73 then (values i scheme nil nil nil path) 74 elseif (and scheme colon) 75 then ;; Something like "mailto:foo@bar.com". Put the 76 ;; the non-scheme part into the path 77 (values end scheme nil nil nil (xsubseq colon end)))) 79 ;; called by parse-uri-string-rfc3986 80 (defun state-uri-reference (string start end 81 &aux i scheme userinfo host port path query 83 ;; rule 02: URI-reference = URI / relative-ref 84 ;; values: i scheme host userinfo port path query fragment 85 (if* (multiple-value-setq (i scheme userinfo host port path query 87 (state-uri string start end)) 88 then (values i scheme userinfo host port path query fragment) 89 elseif (multiple-value-setq (i userinfo host port path query fragment) 90 (state-relative-ref string start end)) 91 then (values i nil userinfo host port path query fragment))) 93 ;; called by parse-uri-string-rfc3986 94 (defun state-absolute-uri (string start end 95 &aux i scheme userinfo host port path query i2 96 colon urn-scheme file-scheme) 97 ;; rule 03: absolute-URI = scheme ":" hier-part [ "?" query ] 99 ;; values: i scheme userinfo host port path query 100 (if* (and (multiple-value-setq (i scheme) (state-scheme string start end)) 101 (setq colon (looking-at #\: string i end)) 102 (not (setq urn-scheme (looking-at "urn" string start end t))) 103 (not (setq file-scheme (looking-at "file" string start end t))) 104 (multiple-value-setq (i2 userinfo host port path) 105 (state-hier-part string colon end))) 106 then ;; so far: scheme + ":" + hier-part 108 (if* (at-end-p i end) 109 then (values i scheme userinfo host port path) 110 elseif (and (looking-at #\? string i end) 111 (multiple-value-setq (i query) 112 (state-query string (incf i) end)) 114 then (values i scheme userinfo host port path query)) 116 then ;; values: i "urn" nid r-component nil nss q-component f-component 117 (multiple-value-bind (i3 nid nss q-component f-component r-component) 118 (state-urn-namestring string (incf i) end) 122 r-component ;userinfo 127 f-component ;fragment 129 elseif (and file-scheme 130 (multiple-value-setq (i path) 131 (state-uri-file string colon end))) 132 then (values i scheme nil nil nil path) 133 elseif (and scheme colon) 134 then ;; Something like "mailto:foo@bar.com". Put the 135 ;; the non-scheme part into the path 136 (values end scheme nil nil nil (xsubseq colon end)))) 138 (defun state-hier-part (string start end &aux i userinfo host port 140 ;; rule 04: hier-part = "//" authority path-abempty 141 ;; / "//" path-absolute ***NEW*** 145 ;; values: i userinfo host port path 146 (if* (and (setq i (looking-at "//" string start end)) 147 (multiple-value-setq (i userinfo host port) 148 (state-authority string i end))) 149 then (if* (multiple-value-setq (i2 path) (state-path-abempty string i end)) 150 then (values i2 userinfo host port path) 151 else (values i userinfo host port)) 152 elseif (and (setq i (looking-at "//" string start end)) 153 (multiple-value-setq (i path) 154 (state-path-absolute string i end))) 155 then (values i nil nil nil path) 157 (multiple-value-setq (i path) (state-path-absolute string start end)) 158 (multiple-value-setq (i path) (state-path-rootless string start end)) 159 (multiple-value-setq (i path) (state-path-empty string start end))) 160 then (values i nil nil nil path))) 162 (defun state-relative-ref (string start end &aux i2 query fragment) 163 ;; rule 05: relative-ref = relative-part [ "?" query ] [ "#" fragment ] 165 ;; values: i userinfo host port path query fragment 166 (multiple-value-bind (i userinfo host port path) 167 (state-relative-part string start end) 169 (if* (at-end-p i end) 170 then (values i userinfo host port path) 171 else (when (looking-at #\? string i end) 172 (if* (multiple-value-setq (i2 query) 173 (state-query string (incf i) end)) 175 else (setq query #.*uri-null-marker*))) 177 (when (looking-at #\# string i end) 178 (if* (multiple-value-setq (i2 fragment) 179 (state-fragment string (incf i) end)) 181 else (setq fragment #.*uri-null-marker*))) 183 (when (at-end-p i end) 184 (values i userinfo host port path query fragment)))))) 186 (defun state-relative-part (string start end 187 &aux (i start) path userinfo host port i2) 188 ;; rule 06: relative-part = "//" authority path-abempty 192 ;; values: i userinfo host port path 193 (if* (and (setq i (looking-at "//" string i end)) 194 (multiple-value-setq (i userinfo host port) 195 (state-authority string i end))) 196 then (if* (multiple-value-setq (i2 path) (state-path-abempty string i end)) 197 then (values i2 userinfo host port path) 198 else (values i userinfo host port)) 200 (multiple-value-setq (i path) (state-path-absolute string start end)) 201 (multiple-value-setq (i path) (state-path-noscheme string start end)) 202 (multiple-value-setq (i path) (state-path-empty string start end))) 203 then (values i nil nil nil path))) 205 (defun state-scheme (string start end &aux i scheme) 206 ;; rule 07: scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) 208 (when (looking-at *alpha-bitvector* string start end) 209 (if* (setq i (scan-forward string (1+ start) end *scheme-bitvector*)) 210 then (setq scheme (xsubseq start i)) 211 else ;; just the one char 212 (setq scheme (xsubseq start (setq i (1+ start))))) 215 (defun state-authority (string start end &aux i i2 userinfo host ipv6 zone-id 217 ;; rule 08: authority = [ userinfo "@" ] host [ ":" port ] 218 ;; values: i userinfo host port 220 ((and (multiple-value-setq (i userinfo) (state-userinfo string start end)) 221 (setq i (looking-at #\@ string i end)) 222 (multiple-value-setq (i host ipv6 zone-id) 223 (state-host string i end))) 224 ;; Somewhat of a hack, but I don't want to change all the functions 225 ;; to expect even more multiple values: 226 (when ipv6 (setq host (list host ipv6 zone-id))) 228 ;; have: userinfo "@" host 229 (if* (not (setq i2 (looking-at #\: string i end))) 230 then ;; done, return what we have 231 (values i userinfo host) 232 elseif (multiple-value-setq (i port) (state-port string i2 end)) 233 then ;; found ":" and port 234 (values i userinfo host port) 235 else ;; found ":" and NO port 236 (values i2 userinfo host))) 238 ;; no userinfo, check for host 239 ((multiple-value-setq (i host ipv6 zone-id) (state-host string start end)) 240 ;; Somewhat of a hack, but I don't want to change all the functions 241 ;; to expect even more multiple values: 242 (when ipv6 (setq host (list host ipv6 zone-id))) 244 (if* (not (setq i2 (looking-at #\: string i end))) 245 then (values i nil host) 246 elseif (multiple-value-setq (i port) (state-port string i2 end)) 247 then (values i nil host port) 248 else ;; found ":" and NO port 249 (values i2 nil host))))) 251 (defun state-userinfo (string start end &aux i) 252 ;; rule 09: userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) 254 ;; This one is more difficult, due to the alternation with 256 ;; *( unreserved / pct-encoded / sub-delims / ":" ) 257 ;; All the others are just characters, but pct-encoded is a 258 ;; specific sequence of characters. 259 (when (setq i (scan-forward string start end *userinfo-bitvector* 261 (values i (xsubseq start i)))) 263 (defun state-port (string start end &aux i) 264 ;; rule 11: port = *DIGIT 265 (when (setq i (scan-forward string start end *digit-bitvector*)) 266 (values i (xsubseq start i)))) 268 (defun state-host (string start end &aux i host ipv6 zone-id) 269 ;; rule 10: host = IP-literal / IPv4address / reg-name 270 ;; values: i host ipv6 zone-id 271 (if* (multiple-value-setq (i ipv6 zone-id) 272 (state-ip-literal string start end)) 273 then (values i nil ipv6 zone-id) 275 (multiple-value-setq (i host) (state-ipv4address string start end)) 276 (multiple-value-setq (i host) (state-reg-name string start end))) 277 then (values i host))) 279 (defun state-ip-literal (string start end &aux ip-start i2 end-ip ip zone-id) 280 ;; rule 12a: IP-literal = "[" ( IPv6addrz / IPvFuture ) "]" 281 ;; values: i ipaddr zone-id 282 ;; NOTE: the [ and ] are not returned as part of the host. 283 (when (and (setq ip-start (looking-at #\[ string start end)) 284 (or (multiple-value-setq (end-ip ip zone-id) 285 (state-ipv6addrz string ip-start end)) 286 (multiple-value-setq (end-ip ip zone-id) 287 (state-ipvfuture string ip-start end))) 288 (setq i2 (looking-at #\] string end-ip end))) 289 (values i2 ip zone-id))) 291 (defun state-ipv6addrz (string start end &aux ip-end zone-start zone-end) 292 ;; rule 12b: IPv6addrz = IPv6address [ "%25" ZoneID ] 293 ;; values: i ipaddr zone-id 294 (when (setq ip-end (scan-ipv6address string start end)) 295 (if* (and (setq zone-start (looking-at "%25" string ip-end end)) 296 (setq zone-end (scan-zone-id string zone-start end))) 297 then (values zone-end 298 (xsubseq start ip-end) 299 (xsubseq zone-start zone-end)) 300 else (values ip-end (xsubseq start ip-end))))) 302 (defun scan-zone-id (string start end) 303 ;; rule 12c: ZoneID = 1*( unreserved / pct-encoded ) 304 (scan-forward string start end *unreserved-bitvector* #'scan-pct-encoded)) 306 (defun state-ipvfuture (string start end &aux i) 308 ;; IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" ) 309 ;; values: i ipvfuture 310 (when (and (setq i (looking-at #\v string start end)) 311 (setq i (scan-forward string i end *hexdig-bitvector*)) 312 (setq i (looking-at #\. string i end)) 313 (setq i (scan-forward string i end *ipvfuture-bitvector*))) 314 (values i (xsubseq start i)))) 316 (defun scan-ipv6address (string start end &aux (i start)) 318 ;; IPv6address = 6( h16 ":" ) ls32 [1] 319 ;; / "::" 5( h16 ":" ) ls32 [2] 320 ;; / [ h16 ] "::" 4( h16 ":" ) ls32 [3] 321 ;; / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32 [4] 322 ;; / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32 [5] 323 ;; / [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32 [6] 324 ;; / [ *4( h16 ":" ) h16 ] "::" ls32 [7] 325 ;; / [ *5( h16 ":" ) h16 ] "::" h16 [8] 326 ;; / [ *6( h16 ":" ) h16 ] "::" [9] 329 (and (setq i (scan-h16-colon-pairs string start end 6 6)) ;; [1] 330 (setq i (scan-ls32 string i end))) 331 (and (setq i (looking-at "::" string start end)) ;; [2] 332 (setq i (scan-h16-colon-pairs string i end 5 5)) 333 (setq i (scan-ls32 string i end))) 334 (and (setq i (scan-h16 string start end)) ;; [3] 335 (setq i (looking-at "::" string i end)) 336 (setq i (scan-h16-colon-pairs string i end 4 4)) 337 (setq i (scan-ls32 string i end))) 338 (setq i (scan-ipv6address-part4 string start end)) ;; [4] 339 (setq i (scan-ipv6address-part5 string start end)) ;; [5] 340 (setq i (scan-ipv6address-part6 string start end)) ;; [6] 341 (setq i (scan-ipv6address-part7 string start end)) ;; [7] 342 (setq i (scan-ipv6address-part8 string start end)) ;; [8] 343 (and (setq i (scan-h16-colon-pairs string start end 0 6)) ;; [9] 344 (setq i (scan-h16 string i end)) 345 (setq i (looking-at "::" string i end))) 346 (setq i (looking-at "::" string start end)) ;; [10] 349 (defun scan-ipv6address-part4 (string start end &aux i) 350 ;; rule: [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32 351 (or (and (setq i (looking-at "::" string start end)) 352 (setq i (scan-h16-colon-pairs string i end 3 3)) 353 (setq i (scan-ls32 string i end))) 355 (and (setq i (scan-h16-colon-pairs string start end 0 1)) 356 (setq i (scan-h16 string i end)) 357 (setq i (looking-at "::" string i end)) 358 (setq i (scan-h16-colon-pairs string i end 3 3)) 359 (setq i (scan-ls32 string i end))))) 361 (defun scan-ipv6address-part5 (string start end &aux i) 362 ;; rule: [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32 363 (or (and (setq i (looking-at "::" string start end)) 364 (setq i (scan-h16-colon-pairs string i end 2 2)) 365 (setq i (scan-ls32 string i end))) 367 (and (setq i (scan-h16-colon-pairs string start end 0 2)) 368 (setq i (scan-h16 string i end)) 369 (setq i (looking-at "::" string i end)) 370 (setq i (scan-h16-colon-pairs string i end 2 2)) 371 (setq i (scan-ls32 string i end))))) 373 (defun scan-ipv6address-part6 (string start end &aux i) 374 ;; rule: [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32 375 (or (and (setq i (looking-at "::" string start end)) 376 (setq i (scan-h16 string i end)) 377 (setq i (looking-at #\: string i end)) 378 (setq i (scan-ls32 string i end))) 379 (and (setq i (scan-h16-colon-pairs string start end 0 3)) 380 (setq i (scan-h16 string i end)) 381 (setq i (looking-at "::" string i end)) 382 (setq i (scan-h16 string i end)) 383 (setq i (looking-at #\: string i end)) 384 (setq i (scan-ls32 string i end))))) 386 (defun scan-ipv6address-part7 (string start end &aux i) 387 ;; rule: [ *4( h16 ":" ) h16 ] "::" ls32 388 (or (and (setq i (looking-at "::" string start end)) 389 (setq i (scan-ls32 string i end))) 390 (and (setq i (scan-h16-colon-pairs string start end 0 4)) 391 (setq i (scan-h16 string i end)) 392 (setq i (looking-at "::" string i end)) 393 (setq i (scan-ls32 string i end))))) 395 (defun scan-ipv6address-part8 (string start end &aux i) 396 ;; rule: [ *5( h16 ":" ) h16 ] "::" h16 397 (or (and (setq i (looking-at "::" string start end)) 398 (setq i (scan-h16 string i end))) 399 (and (setq i (scan-h16-colon-pairs string start end 0 5)) 400 (setq i (scan-h16 string i end)) 401 (setq i (looking-at "::" string i end)) 402 (setq i (scan-h16 string i end))))) 404 (defun scan-h16-colon-pairs (string start end min max 408 ;; subrule: min*max( h16 ":" ) 409 ;; Scan from min to max pairs of: h16 + ":" 410 ;; NOTE: this function needs to lookahead to make sure there isn't a :: 412 (loop while (and (< nfound max) 413 (setq i2 (scan-h16 string i end)) 414 (setq i2 (looking-at #\: string i2 end)) 416 (not (looking-at #\: string i2 end))) 420 (when (<= min nfound max) 423 (defun scan-h16 (string start end &aux i) 424 ;; rule 15: h16 = 1*4HEXDIG 425 (when (null start) (error "start is null")) 427 (scan-forward string start 429 (min end (+ start 5)) 431 (<= 1 (the fixnum (- i start)) 4)) 434 (defun scan-ls32 (string start end &aux i) 435 ;; rule 16: ls32 = ( h16 ":" h16 ) / IPv4address 436 (if* (and (setq i (scan-h16 string start end)) 437 (setq i (looking-at #\: string i end)) 438 (setq i (scan-h16 string i end))) 440 else (scan-ipv4address string start end))) 442 (defun scan-ipv4address (string start end &aux i) 444 ;; IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet 446 (and (setq i (scan-dec-octet string start end)) 447 (setq i (looking-at #\. string i end)) 448 (setq i (scan-dec-octet string i end)) 449 (setq i (looking-at #\. string i end)) 450 (setq i (scan-dec-octet string i end)) 451 (setq i (looking-at #\. string i end)) 452 (scan-dec-octet string i end))) 454 (defun state-ipv4address (string start end &aux i) 456 (when (setq i (scan-ipv4address string start end)) 457 (values i (xsubseq start i)))) 459 (defun scan-dec-octet (string start end &aux i) 461 ;; dec-octet = DIGIT ; 0-9 462 ;; / %x31-39 DIGIT ; 10-99 463 ;; / "1" 2DIGIT ; 100-199 464 ;; / "2" %x30-34 DIGIT ; 200-249 465 ;; / "25" %x30-35 ; 250-255 466 ;; Honestly, the above makes little sense to me. The truth is, 467 ;; "http://256.0.0.1/" is a valid URI because even though it doesn't 468 ;; parse as a dec-octet, it does parse as a reg-name (rule 19). 469 (when (and (setq i (scan-forward string start end *digit-bitvector*)) 470 (<= 1 (- i start) 3)) 473 (defun state-reg-name (string start end &aux i) 474 ;; rule 19: reg-name = *( unreserved / pct-encoded / sub-delims ) 476 (when (setq i (scan-forward string start end *reg-name-bitvector* 478 (values i (xsubseq start i)))) 480 (defun state-path-abempty (string start end &aux i i2) 481 ;; rule 21: path-abempty = *( "/" *pchar ) 483 ;; NOTE: if *strict-parse* is nil, we allow the leading "/" to be "//", 484 ;; because it is a common typo in HTML and sometimes fixing it is 485 ;; not under our control. Browsers work fine with this 487 (when (and (not *strict-parse*) 488 (looking-at "//" string start end)) 489 ;; double leading slash is changed to a single leading slash. 494 (if* (looking-at #\/ string i end) 495 then (if* (setq i2 (scan-pchar string (1+ i) end)) 497 else (incf i) ;; advance for the / we found 501 (values i (xsubseq start i)))) 503 (defun state-path-absolute (string start end &aux (i start) i2 have-slash) 504 ;; rule 22: path-absolute = "/" [ 1*pchar *( "/" *pchar ) ] 505 ;; remember: [ foo ] means 0*1( foo ) 507 (when (setq i (looking-at #\/ string i end)) 508 (when (setq i2 (scan-pchar string i end)) 509 ;; parse is good to here 512 ;; Now, look for *( "/" *pchar ) 513 (loop while (and (setq have-slash (looking-at #\/ string i end)) 514 (setq i2 (scan-pchar string have-slash end))) 516 ;; If it ends with a /: 517 (when (and have-slash (not i2)) (incf i))) 518 (values i (xsubseq start i)))) 520 (defun state-path-noscheme (string start end &aux (i start) i2 have-slash) 521 ;; rule 23: path-noscheme = segment-nz-nc *( "/" *pchar ) 523 (when (setq i (scan-segment-nz-nc string i end)) 524 (loop while (and (setq have-slash (looking-at #\/ string i end)) 525 (setq i2 (scan-pchar string (1+ i) end))) 527 (when (and have-slash (not i2)) 528 ;; for the slash we did see: 530 (values i (xsubseq start i)))) 532 (defun state-path-rootless (string start end &aux (i start) i2) 533 ;; rule 24: path-rootless = 1*pchar *( "/" *pchar ) 535 (when (setq i (scan-pchar string i end)) 536 (loop while (and (looking-at #\/ string i end) 537 ;; The pchar after the slash is optional 538 (setq i2 (or (scan-pchar string (1+ i) end) 541 (values i (xsubseq start i)))) 543 (defun state-path-empty (string start end) 544 ;; rule 25: path-empty = 0<pchar> 546 ;; NOTE: the RHS was updated in RFC 3986 errata to be "", but that is 547 ;; bogus. "" is very different the 0<pchar>. 548 ;; Return nil when looking at a `pchar' and the null marker otherwise. 549 (declare (optimize (safety 0))) 550 (if* (looking-at *pchar-bitvector* string start end) 552 else (values start #.*uri-null-marker*))) 554 (defun scan-segment-nz-nc (string start end) 555 ;; rule 28: 1*( unreserved / pct-encoded / sub-delims / "@" ) 556 ;; In english: pchar without #\: 557 (declare (optimize (safety 0))) 558 (scan-forward string start end *segment-nz-nc-bitvector* #'scan-pct-encoded)) 560 (defun scan-pchar (string start end) 561 ;; rule 29: pchar = unreserved / pct-encoded / sub-delims / ":" / "@" 562 (declare (optimize (safety 0))) 563 (scan-forward string start end *pchar-bitvector* #'scan-pct-encoded)) 565 (defun state-query (string start end &aux i) 566 ;; rule 30: *( pchar / "/" / "?" ) 569 (scan-forward string start end 571 then *query-bitvector-strict* 572 else *query-bitvector-non-strict*) 574 (values i (xsubseq start i)))) 576 (defun state-fragment (string start end &aux i) 577 ;; rule 31: *( pchar / "/" / "?" / "#" ) 578 ;; NOTE: Allegro CL added "#" in non-strict mode 579 ;; values: i fragment 581 (scan-forward string start end 583 then *fragment-bitvector-strict* 584 else *fragment-bitvector-non-strict*) 586 (values i (xsubseq start i)))) 588 (defvar .pct-encoded. nil) 590 (defun scan-pct-encoded (string start end) 591 ;; This scans a single percent encoded sequence. It does no conversion. 592 ;; It also sets .pct-encoded., which is a boolean that says "this string 593 ;; has some percent encoded characters in it." 595 ;; rule 32: pct-encoded = "%" HEXDIG HEXDIG 596 (declare (fixnum start end)) 597 (and (> (the fixnum (- end start)) 2) ;; ... at least 3 chars remaining 598 (looking-at #\% string start end) 599 (looking-at *hexdig-bitvector* string (incf start) end) 600 (looking-at *hexdig-bitvector* string (incf start) end) 601 (setq .pct-encoded. start))) 603 (defun state-uri-file (string start end &aux i) 604 ;; rule: uri-file = "//" <anything> 607 ;; It's not the job of the URI parser to validate file:// URIs. 608 (when (setq i (looking-at "//" string start end)) 609 (values i (xsubseq i end)))) 611 (defun state-urn-namestring (string start end 612 &aux (i start) i2 nid nss q-component f-component 614 ;; rule 50: namestring = assigned-name 616 ;; [ "#" f-component ] 617 ;; rule 58: f-component = fragment 618 ;; START is just after "urn:". 619 ;; values: i nid nss q-component f-component r-component 620 (when (multiple-value-setq (i2 nid nss) 621 (state-urn-assigned-name string start end)) 622 (when (at-end-p i2 end) 623 (return-from state-urn-namestring (values i2 nid nss))) 626 (when (multiple-value-setq (i2 r-component q-component) 627 (state-urn-rq-components string i end)) 628 (when (at-end-p i2 end) 629 (return-from state-urn-namestring 630 (values i2 nid nss q-component nil r-component))) 632 ;; more STRING to process... 634 (when (looking-at #\# string i end) 635 (if* (multiple-value-setq (i2 f-component) 636 ;; Yes, the same fragment (RFC 8141 defines f-component in 637 ;; terms of RFC 3986's fragment). 638 (state-fragment string (incf i) end)) 640 else (setq f-component #.*uri-null-marker*))) 642 (when (at-end-p i end) 643 (values i2 nid nss q-component f-component r-component))))) 645 (defun state-urn-assigned-name (string start end &aux i i2 nid nss) 646 ;; rule 51: assigned-name = "urn" ":" NID ":" NSS 647 ;; START is just after "urn:". 649 (when (and (multiple-value-setq (i2 nid) (state-urn-nid string start end)) 650 (looking-at #\: string i2 end) 652 (multiple-value-setq (i2 nss) (state-urn-nss string i end))) 653 (values i2 nid nss))) 655 (defun state-urn-nid (string start end &aux (i start)) 656 ;; rule 52: NID = (alphanum) 0*30(ldh) (alphanum) 657 ;; rule 53: ldh = alphanum / "-" 659 (declare (fixnum start end i)) 660 (when (and (looking-at *alphanum-bitvector* string i end) 661 (setq i (scan-forward string (1+ i) end *alphanum+-bitvector*)) 662 ;; Check for <= 32 chars, thus far 663 (<= (the fixnum (- i start)) 665 ;; If the last one was alphanum, then we're done. 666 ;; If the last one was NOT alphanum, then: 667 ;; 1. make sure we had 30 chars (not 31) 668 ;; 2. look for another, single alphanum 669 (or (looking-at *alphanum-bitvector* string (1- i) end) 670 (and (<= (the fixnum (- i start)) 672 (not (at-end-p i end)) 674 (scan-forward string i end *alphanum-bitvector*))))) 675 (values i (xsubseq start i)))) 677 (defun state-urn-nss (string start end &aux i i2) 678 ;; rule 54: NSS = pchar *(pchar / "/") 680 (when (setq i (scan-pchar string start (1+ start))) 681 (if* (setq i2 (scan-forward 683 ;; See the definition of *urn-nss-chars* for 684 ;; why we don't use *pchar/-bitvector* here. 687 then (values i2 (xsubseq start i2)) 688 else (values i (xsubseq start i))))) 690 (defun state-urn-rq-components (string start end 691 &aux i ri qi r-component q-component) 692 ;; rule 55: rq-components = [ "?+" r-component ] 693 ;; [ "?=" q-component ] 694 ;; values: i r-component q-component 695 (when (and (setq i (looking-at #\? string start end)) 696 (not (at-end-p i end)) 697 (or (setq ri (looking-at #\+ string i end)) 698 (setq qi (looking-at #\= string i end))) 699 (not (at-end-p (or ri qi) end))) 700 (when (and ri (multiple-value-setq (i r-component) 701 (state-urn-r-component string ri end))) 702 (when (at-end-p i end) 703 (return-from state-urn-rq-components 704 (values i r-component))) 706 (if* (setq qi (looking-at #\? string i end)) 707 then (when (and (not (at-end-p qi end)) 708 (setq qi (looking-at #\= string qi end)) 709 (not (at-end-p qi end))) 710 (when (multiple-value-setq (i q-component) 711 (state-urn-q-component string qi end)) 712 (return-from state-urn-rq-components 713 (values i r-component q-component)))) 714 else (return-from state-urn-rq-components (values i r-component)))) 715 ;; The r-component branch didn't yield anything, check for q-component 717 (when (and qi (multiple-value-setq (i q-component) 718 (state-urn-q-component string qi end))) 719 (return-from state-urn-rq-components 720 (values i nil q-component))))) 722 (defun scan-q-component-or-pct-encoded (string i end &aux i2) 723 ;; Do what scan-pct-encoded does, BUT STOP scanning if we see "?=", 724 ;; because that is the start of the q-component. 726 ;; This function is called by SCAN-FORWARD at each character position in 729 (when (setq i2 (scan-pct-encoded string i end)) 730 (return-from scan-q-component-or-pct-encoded i2)) 732 (when (setq i2 (looking-at #\? string i end)) 734 ;; at least 2 chars remaining (for 1 char after ?=) 736 (looking-at #\= string i2 end)) 737 then ;; stop scanning 738 (return-from scan-q-component-or-pct-encoded nil) 739 else ;; return the index after the ? 740 (return-from scan-q-component-or-pct-encoded i2)))) 742 (defun state-urn-r-component (string start end &aux i i2) 743 ;; rule 56: r-component = pchar *( pchar / "/" / "?" ) 744 ;; values: i r-component 745 (when (setq i (scan-pchar string start end)) 746 (when (at-end-p i end) 747 (return-from state-urn-r-component 748 (values i (xsubseq start i)))) 753 ;; NOTE: we don't use *query-bitvector-strict* because we need 754 ;; to handle #\? specially (see the next argument). 755 *urn-query-bitvector* 756 ;; NOTE: Because r-component can contain "?" without percent 757 ;; encoding, when processing the r-component we need to 758 ;; look ahead to make sure there is no #\= after each 759 ;; #\? (since that means we have a q-component). 760 #'scan-q-component-or-pct-encoded)) 761 (values i2 (xsubseq start i2))) 763 ;; We immediately ran into ?=, so return what we found so far: 764 (t (values i (xsubseq start i)))))) 766 (defun state-urn-q-component (string start end &aux i) 767 ;; rule 57: q-component = pchar *( pchar / "/" / "?" ) 768 ;; values: i q-component 769 (when (setq i (looking-at *pchar-bitvector* string start end)) 770 (when (at-end-p i end) 771 (return-from state-urn-q-component 772 (values i (xsubseq start i)))) 774 (scan-forward string i end *query-bitvector-strict* 776 (values i (xsubseq start i)))))