changeset 387: |
8252ee515756 |
parent: |
a0dfde3cb3c4
|
author: |
Richard Westhaver <ellis@rwest.io> |
date: |
Thu, 30 May 2024 18:31:53 -0400 |
permissions: |
-rw-r--r-- |
description: |
db and readtables |
1 ;;; std/str.lisp --- String utilities 5 ;; (defvar sb-unicode-syms 6 ;; '(words lines sentences whitespace-p uppercase lowercase titlecase 7 ;; word-break-class line-break-class sentence-break-class char-block 8 ;; cased-p uppercase-p lowercase-p titlecase-p casefold 9 ;; graphemes grapheme-break-class 10 ;; bidi-mirroring-glyph bidi-class 11 ;; normalize-string normalized-p default-ignorable-p 12 ;; confusable-p hex-digit-p mirrored-p alphabetic-p math-p 13 ;; decimal-value digit-value 14 ;; unicode< unicode> unicode= unicode-equal 15 ;; unicode<= unicode>=)) 16 (in-package :std/string) 18 ;; (mapc (lambda (s) (export s)) sb-unicode-syms) 21 ;; :include sb-unicode-syms) 23 (defparameter *omit-nulls* nil) 24 (defvar *whitespaces* (list #\Backspace #\Tab #\Linefeed #\Newline #\Vt #\Page 25 #\Return #\Space #\Rubout 26 #+sbcl #\Next-Line #-sbcl (code-char 133) 28 "On some implementations, linefeed and newline represent the same character (code).") 30 (deftype string-designator () 31 "A string designator type. A string designator is either a string, a symbol, 33 `(or symbol string character)) 35 (defun ssplit (separator s &key (omit-nulls *omit-nulls*)) 36 "Split s into substring by separator (cl-ppcre takes a regex, we do not). 38 `limit' limits the number of elements returned (i.e. the string is 39 split at most `limit' - 1 times)." 40 ;; cl-ppcre:split doesn't return a null string if the separator appears at the end of s. 41 (let* ((res (cl-ppcre:split separator s))) 43 (remove-if (lambda (it) (sequence:emptyp it)) res) 46 (defun collapse-whitespaces (s) 47 "Ensure there is only one space character between words. 49 (cl-ppcre:regex-replace-all "\\s+" s " ")) 51 (defun trim (s &key (char-bag *whitespaces*)) 52 "Removes all characters in `char-bag` (default: whitespaces) at the beginning and end of `s`. 53 If supplied, char-bag has to be a sequence (e.g. string or list of characters). 55 Examples: (trim \" foo \") => \"foo\" 56 (trim \"+-*foo-bar*-+\" :char-bag \"+-*\") => \"foo-bar\" 57 (trim \"afood\" :char-bag (str:concat \"a\" \"d\")) => \"foo\"" 59 (string-trim char-bag s))) 61 ;;; TODO 2023-08-27: camel snake kebab 63 (defun make-template-parser (start-delimiter end-delimiter &key (ignore-case nil)) 64 "Returns a closure than can substitute variables 65 delimited by \"start-delimiter\" and \"end-delimiter\" 66 in a string, by the provided values." 67 (check-type start-delimiter string) 68 (check-type end-delimiter string) 69 (when (or (string= start-delimiter "") 70 (string= end-delimiter "")) 71 (error 'simple-type-error 72 :format-control "The empty string is not a valid delimiter.")) 73 (let ((start-len (length start-delimiter)) 74 (end-len (length end-delimiter)) 79 (lambda (string values) 80 (check-type string string) 81 (unless (listp values) 82 (error 'simple-type-error 83 :format-control "values should be an association list")) 85 (with-output-to-string (stream) 86 (loop for prev = 0 then (+ j end-len) 87 for i = (search start-delimiter string) 88 then (search start-delimiter string :start2 j) 89 for j = (if i (search end-delimiter string :start2 i)) 90 then (if i (search end-delimiter string :start2 i)) 92 do (write-string (subseq string prev i) stream) 93 (let ((instance (rest (assoc (subseq string (+ i start-len) j) 97 (princ instance stream) 98 (write-string (subseq string i (+ j end-len)) stream))) 100 finally (write-string (subseq string prev) stream)))))) 103 ;;; Implementing an efficient string= case in Common Lisp 105 ;;; 2015-11-15: Defknown don't have explicit-check in SBCL 1.3.0 106 ;;; Remove the declaration. It's never useful the way we use 109 ;;; 2015-11-15: Make this a real ASDF system for Xach 110 ;;; I copied the system definition from Quicklisp and mangled as 113 ;;; 2010-06-30: Tiny bugfix 114 ;;; Widen the type declarations inside cases to allow vectors that 115 ;;; have a length that's shorter than the total size (due to fill- 121 ;;; In `<http://neverfriday.com/blog/?p=10>', OMouse asks how 122 ;;; best to implement a `string= case' (in Scheme). I noted that 123 ;;; naively iterating through the cases with `string=' at runtime 124 ;;; is suboptimal. Seeing the problem as a simplistic pattern 125 ;;; matching one makes an efficient solution obvious. 126 ;;; Note that, unlike Haskell, both Scheme and CL have random- 127 ;;; access on strings in O(1), something which I exploit to 128 ;;; generate better code. 130 ;;; This is also a pbook.el file (the pdf can be found at 131 ;;; `<http://www.discontinuity.info/~pkhuong/string-case.pdf>' ). 132 ;;; I'm new at this not-quite-illiterate programming thing, so 133 ;;; please bear with me (: I'm also looking for comments on the 134 ;;; formatting. I'm particularly iffy with the way keywords look 135 ;;; like. It just looks really fuzzy when you're not really zoomed 136 ;;; in (or reading it on paper). 138 ;;; I usually don't use packages for throw-away code, but this looks 139 ;;; like it could be useful to someone. 141 ;;;# Some utility code 143 (defun split-tree (list &key (test 'eql) (key 'identity)) 144 "Splits input list into sublists of elements 145 whose elements are all such that (key element) 147 It's assumed that test and key form an equality class. 148 (This is similar to groupBy)" 151 (cur-list (list (first list))) 152 (cur-key (funcall key (first list)))) 153 (dolist (elt (rest list) (nreverse (cons (nreverse cur-list) 155 (let ((new-key (funcall key elt))) 156 (if (funcall test cur-key new-key) 159 (push (nreverse cur-list) lists) 160 (setf cur-list (list elt) 161 cur-key new-key)))))))) 164 (loop for i below n collect i)) 166 (defun hash-table->list (table &key (keep-keys t) (keep-values t)) 167 "Saves the keys and/or values in table to a list. 168 As with hash table iterating functions, there is no 171 (maphash (cond ((and keep-keys 174 (push (cons k v) list))) 186 (defun all-equal (list &key (key 'identity) (test 'eql)) 190 (let ((first-key (funcall key (first list)))) 191 (every (lambda (element) 192 (funcall test first-key 193 (funcall key element))) 196 (defun split-at (list n) 197 "Split list in k lists of n elements (or less for the last list)" 198 (declare (type (and fixnum (integer (0))) n)) 202 (declare (type (and fixnum unsigned-byte) counter)) 203 (dolist (elt list (nreverse (if cur-list 204 (cons (nreverse cur-list) 208 (when (= (incf counter) n) 209 (push (nreverse cur-list) lists) 213 ;;;# The string matching compiler per se 215 ;;; I use special variables here because I find that 216 ;;; preferable to introducing noise everywhere to thread 217 ;;; these values through all the calls, especially 218 ;;; when `*no-match-form*' is only used at the very end. 220 (defparameter *input-string* nil 221 "Symbol of the variable holding the input string") 223 (defparameter *no-match-form* nil 224 "Form to insert when no match is found.") 226 ;;; The basic idea of the pattern matching process here is 227 ;;; to first discriminate with the input string's length; 228 ;;; once that is done, it is very easy to safely use random 229 ;;; access until only one candidate string (pattern) remains. 230 ;;; However, even if we determine that only one case might be 231 ;;; a candidate, it might still be possible for another string 232 ;;; (not in the set of cases) to match the criteria. So we also 233 ;;; have to make sure that *all* the indices match. A simple 234 ;;; way to do this would be to emit the remaining checks at the 235 ;;; every end, when only one candidate is left. However, that 236 ;;; can result in a lot of duplicate code, and some useless 237 ;;; work on mismatches. Instead, the code generator always 238 ;;; tries to find (new) indices for which all the candidates 239 ;;; left in the branch share the same character, and then emits 240 ;;; a guard, checking the character at that index as soon as possible. 242 ;;; In my experience, there are two main problems when writing 243 ;;; pattern matchers: how to decide what to test for at each 244 ;;; fork, and how to ensure the code won't explode exponentially. 245 ;;; Luckily, for our rather restricted pattern language (equality 246 ;;; on strings), patterns can't overlap, and it's possible to guarantee 247 ;;; that no candidate will ever be possible in both branches of a 250 ;;; Due to the the latter guarantee, we have a simple fitness 251 ;;; measure for tests: simply maximising the number of 252 ;;; candidates in the smallest branch will make our search tree 253 ;;; as balanced as possible. Of course, we don't know whether 254 ;;; the subtrees will be balanced too, but I don't think it'll 255 ;;; be much of an issue. 257 ;;; Note that, if we had access, whether via annotations or profiling, 258 ;;; to the probability of each case, the situation would be very 259 ;;; different. In fact, on a pipelined machine where branch 260 ;;; mispredictions are expensive, an unbalanced tree will yield 261 ;;; better expected runtimes. There was a very interesting and rather 262 ;;; sophisticated Google lecture on that topic on Google video 263 ;;; (the speaker used markov chains to model dynamic predictors, 264 ;;; for example), but I can't seem to find the URL. 266 ;;; TODO: Find bounds on the size of the code! 268 (defun find-best-split (strings to-check) 269 "Iterate over all the indices left to check to find 270 which index (and which character) to test for equality 271 with, keeping the ones which result in the most balanced 273 (flet ((evaluate-split (i char) 274 "Just count all the matches and mismatches" 277 (dolist (string strings (min = /=)) 278 (if (eql (aref string i) char) 281 (uniquify-chars (chars) 282 "Only keep one copy of each char in the list" 283 (mapcar 'first (split-tree (sort chars 'char<) :test #'eql)))) 284 (let ((best-split 0) ; maximise size of smallest branch 287 (dolist (i to-check (values best-posn best-char)) 288 (dolist (char (uniquify-chars (mapcar (lambda (string) 291 (let ((Z (evaluate-split i char))) 292 (when (> Z best-split) 295 best-char char)))))))) 297 ;;; We sometimes have to execute sequences of checks for 298 ;;; equality. The natural way to express this is via a 299 ;;; sequence of checks, wrapped in an `and'. However, that 300 ;;; translates to a sequence of conditional branches, predicated 301 ;;; on very short computations. On (not so) modern architectures, 302 ;;; it'll be faster to coalesce a sequence of such checks together 303 ;;; as straightline code (e.g. via `or' of `xor'), and only branch 304 ;;; at the very end. The code doesn't become much more complex, 305 ;;; and benchmarks have shown it to be beneficial (giving a speed 306 ;;; up of 2-5% for both predictable and unpredictable workloads, 309 ;;; Benchmarks (and experience) have shown that, instead of executing 310 ;;; a cascade of comparison/conditional branch, it's slightly 311 ;;; faster, both for predictable and unpredictable workloads, 312 ;;; to `or' together a bunch of comparisons (e.g. `xor'). On a Core 2 313 ;;; processor, it seems that doing so for sequences of around 4 314 ;;; comparisons is the sweetspot. On perfectly predictable input, 315 ;;; aborting early (on the first check) saves as much time as 316 ;;; the 4 test/conditional branch add, compared to a sequence of 319 ;;; Numeric char= abstracts out the xor check, and, on SBCL, 320 ;;; is replaced by a short assembly sequence when the first 321 ;;; argument is a constant. The declared return type is then 322 ;;; wider than strictly necessary making it fit in a machine 323 ;;; register, but not as a fixnum ensures that the compiler 324 ;;; won't repeatedly convert the values to fixnums, when all 325 ;;; we'll do is `or' them together and check for zero-ness. 326 ;;; This function is the only place where the macro isn't 327 ;;; generic over the elements stored in the cases. It shouldn't 328 ;;; be too hard to implement a numeric-eql, which would 329 ;;; restore genericity to the macro, while keeping the 333 ;; (defknown numeric-char= (character character) 334 ;; (unsigned-byte #. (1- sb-vm:n-machine-word-bits)) 335 ;; (movable foldable flushable)) 337 ;; (define-vop (numeric-char=) 338 ;; (:args (x :scs (sb-vm::character-reg sb-vm::character-stack) 340 ;; :load-if (not (location= x r)))) 342 ;; (:arg-types (:constant character) character) 343 ;; (:results (r :scs (sb-vm::unsigned-reg) 344 ;; :load-if (not (location= x r)))) 345 ;; (:result-types sb-vm::unsigned-num) 346 ;; (:translate numeric-char=) 347 ;; (:policy :fast-safe) 348 ;; (:note "inline constant numeric-char=") 351 ;; (sb-vm::inst #:xor r (char-code y))))) 353 #+ (and sbcl (or x86 x86-64)) 354 (defun numeric-char= (x y) 355 (declare (type character x y)) 356 (logxor (char-code x) 359 ;;; At each step, we may be able to find positions for which 360 ;;; there can only be one character. If we emit the check for 361 ;;; these positions as soon as possible, we avoid duplicating 362 ;;; potentially a lot of code. Since benchmarks have shown 363 ;;; it to be useful, this function implements the checks 364 ;;; as a series of (zerop (logior (numeric-char= ...)...)), 365 ;;; if there is more than one such check to emit. 367 (defun emit-common-checks (strings to-check) 368 (labels ((emit-char= (pairs) 369 (mapcar (lambda (pair) 370 (destructuring-bind (posn . char) 372 `(numeric-char= ,char 373 (aref ,*input-string* ,posn)))) 375 (emit-checking-form (common-chars) 377 (let ((common-chars (sort common-chars '< :key 'car))) 378 #+ (and) `(and ,@(mapcar 380 (if (null (rest chunk)) 381 (destructuring-bind ((posn . char)) 384 (aref ,*input-string* ,posn))) 386 (logior ,@(emit-char= chunk))))) 387 (split-at common-chars 4))) 388 #+ (or) `(and ,@(mapcar 390 (destructuring-bind (posn . char) 393 (aref ,*input-string* ,posn)))) 395 (let ((common-chars ()) 397 (dolist (posn to-check (values (emit-checking-form common-chars) 398 (nreverse left-to-check))) 399 (if (all-equal strings :key (lambda (string) 401 (push (cons posn (aref (first strings) posn)) 403 (push posn left-to-check)))))) 405 ;;; The driving function: First, emit any test that is 406 ;;; common to all the candidates. If there's only one 407 ;;; candidate, then we just have to execute the body; 408 ;;; if not, we look for the `best' test and emit the 409 ;;; corresponding code: execute the test, and recurse 410 ;;; on the candidates that match the test and on those 413 (defun make-search-tree (strings bodies to-check) 414 (multiple-value-bind (guard to-check) 415 (emit-common-checks strings to-check) 416 (if (null (rest strings)) 418 (assert (null to-check)) ; there shouldn't be anything left to check 421 (progn ,@(first bodies)) 423 `(progn ,@(first bodies)))) 424 (multiple-value-bind (posn char) 425 (find-best-split strings to-check) 426 (assert posn) ; this can only happen if all strings are equal 432 for string in strings 434 do (if (eql char (aref string posn)) 436 (push string =strings) 439 (push string /=strings) 440 (push body /=bodies)))) 441 (let ((tree `(if (eql ,char (aref ,*input-string* ,posn)) 442 ,(make-search-tree =strings =bodies 443 (remove posn to-check)) 444 ,(make-search-tree /=strings /=bodies 452 ;;; Finally, we can glue it all together. 453 ;;; To recapitulate, first, dispatch on string 454 ;;; length, then execute a search tree for the 455 ;;; few candidates left, and finally make sure 456 ;;; the input string actually matches the one 457 ;;; candidate left at the leaf. 459 (defun emit-string-case (cases input-var no-match) 460 (flet ((case-string-length (x) 462 (let ((*input-string* input-var) 463 (*no-match-form* no-match) 464 (cases-lists (split-tree (sort cases '< 465 :key #'case-string-length) 466 :key #'case-string-length))) 467 `(locally (declare (type vector ,input-var)) 468 (case (length ,input-var) 469 ,@(loop for cases in cases-lists 470 for length = (case-string-length (first cases)) 472 ;; arrays with fill pointers expose the total length 473 ;; in their type, not the position of the fill-pointer. 474 ;; The type below only applies to simple-arrays. 475 (locally (declare (type (or (not simple-array) 476 (simple-array * (,length))) 478 ,(make-search-tree (mapcar 'first cases) 483 ;;; Just wrapping the previous function in a macro, 484 ;;; and adding some error checking (the rest of the code 485 ;;; just assumes there won't be duplicate patterns). 486 ;;; Note how we use a local function instead of passing 487 ;;; the default form directly. This can save a lot on 488 ;;; code size, especially when the default form is 491 (defmacro string-case ((string &key (default '(error "No match"))) 493 "(string-case (string &key default) 495 case ::= string form* 497 Where t is the default case." 498 (let ((cases-table (make-hash-table :test 'equal))) 499 "Error checking cruft" 501 (assert (typep case '(cons (or string (eql t))))) 502 (let ((other-case (gethash (first case) cases-table))) 504 (warn "Duplicate string-case cases: ~A -> ~A or ~A~%" 508 (setf (gethash (first case) cases-table) 510 (let ((input-var (gensym "INPUT")) 511 (default-fn (gensym "ON-ERROR")) 512 (default-body (gethash t cases-table (list default)))) 513 `(let ((,input-var ,string)) 514 (flet ((,default-fn () 516 ,(emit-string-case (progn 517 (remhash t cases-table) 518 (hash-table->list cases-table))