changelog shortlog graph tags branches changeset files revisions annotate raw help

Mercurial > core / lisp/std/string.lisp

changeset 658: 804b5ee20a46
parent: a0dfde3cb3c4
author: Richard Westhaver <ellis@rwest.io>
date: Thu, 19 Sep 2024 23:23:02 -0400
permissions: -rw-r--r--
description: zstd completed (besides zdict), working on readline
1 ;;; std/str.lisp --- String utilities
2 
3 ;;; Code:
4 
5 ;; (defvar sb-unicode-syms
6 ;; '(words lines sentences whitespace-p uppercase lowercase titlecase
7 ;; word-break-class line-break-class sentence-break-class char-block
8 ;; cased-p uppercase-p lowercase-p titlecase-p casefold
9 ;; graphemes grapheme-break-class
10 ;; bidi-mirroring-glyph bidi-class
11 ;; normalize-string normalized-p default-ignorable-p
12 ;; confusable-p hex-digit-p mirrored-p alphabetic-p math-p
13 ;; decimal-value digit-value
14 ;; unicode< unicode> unicode= unicode-equal
15 ;; unicode<= unicode>=))
16 (in-package :std/string)
17 
18 ;; (mapc (lambda (s) (export s)) sb-unicode-syms)
19 ;; (reexport-from
20 ;; :sb-unicode
21 ;; :include sb-unicode-syms)
22 
23 (defparameter *omit-nulls* nil)
24 (defvar *whitespaces* (list #\Backspace #\Tab #\Linefeed #\Newline #\Vt #\Page
25  #\Return #\Space #\Rubout
26  #+sbcl #\Next-Line #-sbcl (code-char 133)
27  #\No-break_space)
28  "On some implementations, linefeed and newline represent the same character (code).")
29 
30 (deftype string-designator ()
31  "A string designator type. A string designator is either a string, a symbol,
32 or a character."
33  `(or symbol string character))
34 
35 (defun ssplit (separator s &key (omit-nulls *omit-nulls*))
36  "Split s into substring by separator (cl-ppcre takes a regex, we do not).
37 
38  `limit' limits the number of elements returned (i.e. the string is
39  split at most `limit' - 1 times)."
40  ;; cl-ppcre:split doesn't return a null string if the separator appears at the end of s.
41  (let* ((res (cl-ppcre:split separator s)))
42  (if omit-nulls
43  (remove-if (lambda (it) (sequence:emptyp it)) res)
44  res)))
45 
46 (defun collapse-whitespaces (s)
47  "Ensure there is only one space character between words.
48  Remove newlines."
49  (cl-ppcre:regex-replace-all "\\s+" s " "))
50 
51 (defun trim (s &key (char-bag *whitespaces*))
52  "Removes all characters in `char-bag` (default: whitespaces) at the beginning and end of `s`.
53  If supplied, char-bag has to be a sequence (e.g. string or list of characters).
54 
55  Examples: (trim \" foo \") => \"foo\"
56  (trim \"+-*foo-bar*-+\" :char-bag \"+-*\") => \"foo-bar\"
57  (trim \"afood\" :char-bag (str:concat \"a\" \"d\")) => \"foo\""
58  (when s
59  (string-trim char-bag s)))
60 
61 ;;; TODO 2023-08-27: camel snake kebab
62 
63 (defun make-template-parser (start-delimiter end-delimiter &key (ignore-case nil))
64  "Returns a closure than can substitute variables
65  delimited by \"start-delimiter\" and \"end-delimiter\"
66  in a string, by the provided values."
67  (check-type start-delimiter string)
68  (check-type end-delimiter string)
69  (when (or (string= start-delimiter "")
70  (string= end-delimiter ""))
71  (error 'simple-type-error
72  :format-control "The empty string is not a valid delimiter."))
73  (let ((start-len (length start-delimiter))
74  (end-len (length end-delimiter))
75  (test (if ignore-case
76  #'string-equal
77  #'string=)))
78 
79  (lambda (string values)
80  (check-type string string)
81  (unless (listp values)
82  (error 'simple-type-error
83  :format-control "values should be an association list"))
84 
85  (with-output-to-string (stream)
86  (loop for prev = 0 then (+ j end-len)
87  for i = (search start-delimiter string)
88  then (search start-delimiter string :start2 j)
89  for j = (if i (search end-delimiter string :start2 i))
90  then (if i (search end-delimiter string :start2 i))
91  while (and i j)
92  do (write-string (subseq string prev i) stream)
93  (let ((instance (rest (assoc (subseq string (+ i start-len) j)
94  values
95  :test test))))
96  (if instance
97  (princ instance stream)
98  (write-string (subseq string i (+ j end-len)) stream)))
99 
100  finally (write-string (subseq string prev) stream))))))
101 
102 ;;; STRING-CASE
103 ;;; Implementing an efficient string= case in Common Lisp
104 ;;;
105 ;;; 2015-11-15: Defknown don't have explicit-check in SBCL 1.3.0
106 ;;; Remove the declaration. It's never useful the way we use
107 ;;; numeric-char=.
108 ;;;
109 ;;; 2015-11-15: Make this a real ASDF system for Xach
110 ;;; I copied the system definition from Quicklisp and mangled as
111 ;;; necessary.
112 ;;;
113 ;;; 2010-06-30: Tiny bugfix
114 ;;; Widen the type declarations inside cases to allow vectors that
115 ;;; have a length that's shorter than the total size (due to fill-
116 ;;; pointers).
117 
118 ;;;
119 ;;;# Introduction
120 ;;;
121 ;;; In `<http://neverfriday.com/blog/?p=10>', OMouse asks how
122 ;;; best to implement a `string= case' (in Scheme). I noted that
123 ;;; naively iterating through the cases with `string=' at runtime
124 ;;; is suboptimal. Seeing the problem as a simplistic pattern
125 ;;; matching one makes an efficient solution obvious.
126 ;;; Note that, unlike Haskell, both Scheme and CL have random-
127 ;;; access on strings in O(1), something which I exploit to
128 ;;; generate better code.
129 ;;;
130 ;;; This is also a pbook.el file (the pdf can be found at
131 ;;; `<http://www.discontinuity.info/~pkhuong/string-case.pdf>' ).
132 ;;; I'm new at this not-quite-illiterate programming thing, so
133 ;;; please bear with me (: I'm also looking for comments on the
134 ;;; formatting. I'm particularly iffy with the way keywords look
135 ;;; like. It just looks really fuzzy when you're not really zoomed
136 ;;; in (or reading it on paper).
137 
138 ;;; I usually don't use packages for throw-away code, but this looks
139 ;;; like it could be useful to someone.
140 
141 ;;;# Some utility code
142 
143 (defun split-tree (list &key (test 'eql) (key 'identity))
144  "Splits input list into sublists of elements
145  whose elements are all such that (key element)
146  are all test.
147  It's assumed that test and key form an equality class.
148  (This is similar to groupBy)"
149  (when list
150  (let* ((lists ())
151  (cur-list (list (first list)))
152  (cur-key (funcall key (first list))))
153  (dolist (elt (rest list) (nreverse (cons (nreverse cur-list)
154  lists)))
155  (let ((new-key (funcall key elt)))
156  (if (funcall test cur-key new-key)
157  (push elt cur-list)
158  (progn
159  (push (nreverse cur-list) lists)
160  (setf cur-list (list elt)
161  cur-key new-key))))))))
162 
163 (defun iota (n)
164  (loop for i below n collect i))
165 
166 (defun hash-table->list (table &key (keep-keys t) (keep-values t))
167  "Saves the keys and/or values in table to a list.
168  As with hash table iterating functions, there is no
169  implicit ordering."
170  (let ((list ()))
171  (maphash (cond ((and keep-keys
172  keep-values)
173  (lambda (k v)
174  (push (cons k v) list)))
175  (keep-keys
176  (lambda (k v)
177  (declare (ignore v))
178  (push k list)))
179  (keep-values
180  (lambda (k v)
181  (declare (ignore k))
182  (push v list))))
183  table)
184  list))
185 
186 (defun all-equal (list &key (key 'identity) (test 'eql))
187  (if (or (null list)
188  (null (rest list)))
189  t
190  (let ((first-key (funcall key (first list))))
191  (every (lambda (element)
192  (funcall test first-key
193  (funcall key element)))
194  (rest list)))))
195 
196 (defun split-at (list n)
197  "Split list in k lists of n elements (or less for the last list)"
198  (declare (type (and fixnum (integer (0))) n))
199  (let ((lists '())
200  (cur-list '())
201  (counter 0))
202  (declare (type (and fixnum unsigned-byte) counter))
203  (dolist (elt list (nreverse (if cur-list
204  (cons (nreverse cur-list)
205  lists)
206  lists)))
207  (push elt cur-list)
208  (when (= (incf counter) n)
209  (push (nreverse cur-list) lists)
210  (setf cur-list '()
211  counter 0)))))
212 
213 ;;;# The string matching compiler per se
214 ;;;
215 ;;; I use special variables here because I find that
216 ;;; preferable to introducing noise everywhere to thread
217 ;;; these values through all the calls, especially
218 ;;; when `*no-match-form*' is only used at the very end.
219 
220 (defparameter *input-string* nil
221  "Symbol of the variable holding the input string")
222 
223 (defparameter *no-match-form* nil
224  "Form to insert when no match is found.")
225 
226 ;;; The basic idea of the pattern matching process here is
227 ;;; to first discriminate with the input string's length;
228 ;;; once that is done, it is very easy to safely use random
229 ;;; access until only one candidate string (pattern) remains.
230 ;;; However, even if we determine that only one case might be
231 ;;; a candidate, it might still be possible for another string
232 ;;; (not in the set of cases) to match the criteria. So we also
233 ;;; have to make sure that *all* the indices match. A simple
234 ;;; way to do this would be to emit the remaining checks at the
235 ;;; every end, when only one candidate is left. However, that
236 ;;; can result in a lot of duplicate code, and some useless
237 ;;; work on mismatches. Instead, the code generator always
238 ;;; tries to find (new) indices for which all the candidates
239 ;;; left in the branch share the same character, and then emits
240 ;;; a guard, checking the character at that index as soon as possible.
241 
242 ;;; In my experience, there are two main problems when writing
243 ;;; pattern matchers: how to decide what to test for at each
244 ;;; fork, and how to ensure the code won't explode exponentially.
245 ;;; Luckily, for our rather restricted pattern language (equality
246 ;;; on strings), patterns can't overlap, and it's possible to guarantee
247 ;;; that no candidate will ever be possible in both branches of a
248 ;;; fork.
249 
250 ;;; Due to the the latter guarantee, we have a simple fitness
251 ;;; measure for tests: simply maximising the number of
252 ;;; candidates in the smallest branch will make our search tree
253 ;;; as balanced as possible. Of course, we don't know whether
254 ;;; the subtrees will be balanced too, but I don't think it'll
255 ;;; be much of an issue.
256 
257 ;;; Note that, if we had access, whether via annotations or profiling,
258 ;;; to the probability of each case, the situation would be very
259 ;;; different. In fact, on a pipelined machine where branch
260 ;;; mispredictions are expensive, an unbalanced tree will yield
261 ;;; better expected runtimes. There was a very interesting and rather
262 ;;; sophisticated Google lecture on that topic on Google video
263 ;;; (the speaker used markov chains to model dynamic predictors,
264 ;;; for example), but I can't seem to find the URL.
265 
266 ;;; TODO: Find bounds on the size of the code!
267 
268 (defun find-best-split (strings to-check)
269  "Iterate over all the indices left to check to find
270  which index (and which character) to test for equality
271  with, keeping the ones which result in the most balanced
272  split."
273  (flet ((evaluate-split (i char)
274  "Just count all the matches and mismatches"
275  (let ((= 0)
276  (/= 0))
277  (dolist (string strings (min = /=))
278  (if (eql (aref string i) char)
279  (incf =)
280  (incf /=)))))
281  (uniquify-chars (chars)
282  "Only keep one copy of each char in the list"
283  (mapcar 'first (split-tree (sort chars 'char<) :test #'eql))))
284  (let ((best-split 0) ; maximise size of smallest branch
285  (best-posn nil)
286  (best-char nil))
287  (dolist (i to-check (values best-posn best-char))
288  (dolist (char (uniquify-chars (mapcar (lambda (string)
289  (aref string i))
290  strings)))
291  (let ((Z (evaluate-split i char)))
292  (when (> Z best-split)
293  (setf best-split Z
294  best-posn i
295  best-char char))))))))
296 
297 ;;; We sometimes have to execute sequences of checks for
298 ;;; equality. The natural way to express this is via a
299 ;;; sequence of checks, wrapped in an `and'. However, that
300 ;;; translates to a sequence of conditional branches, predicated
301 ;;; on very short computations. On (not so) modern architectures,
302 ;;; it'll be faster to coalesce a sequence of such checks together
303 ;;; as straightline code (e.g. via `or' of `xor'), and only branch
304 ;;; at the very end. The code doesn't become much more complex,
305 ;;; and benchmarks have shown it to be beneficial (giving a speed
306 ;;; up of 2-5% for both predictable and unpredictable workloads,
307 ;;; on a Core 2).
308 
309 ;;; Benchmarks (and experience) have shown that, instead of executing
310 ;;; a cascade of comparison/conditional branch, it's slightly
311 ;;; faster, both for predictable and unpredictable workloads,
312 ;;; to `or' together a bunch of comparisons (e.g. `xor'). On a Core 2
313 ;;; processor, it seems that doing so for sequences of around 4
314 ;;; comparisons is the sweetspot. On perfectly predictable input,
315 ;;; aborting early (on the first check) saves as much time as
316 ;;; the 4 test/conditional branch add, compared to a sequence of
317 ;;; `xor' and `or'.
318 
319 ;;; Numeric char= abstracts out the xor check, and, on SBCL,
320 ;;; is replaced by a short assembly sequence when the first
321 ;;; argument is a constant. The declared return type is then
322 ;;; wider than strictly necessary making it fit in a machine
323 ;;; register, but not as a fixnum ensures that the compiler
324 ;;; won't repeatedly convert the values to fixnums, when all
325 ;;; we'll do is `or' them together and check for zero-ness.
326 ;;; This function is the only place where the macro isn't
327 ;;; generic over the elements stored in the cases. It shouldn't
328 ;;; be too hard to implement a numeric-eql, which would
329 ;;; restore genericity to the macro, while keeping the
330 ;;; speed-up.
331 
332 ;; (progn
333 ;; (defknown numeric-char= (character character)
334 ;; (unsigned-byte #. (1- sb-vm:n-machine-word-bits))
335 ;; (movable foldable flushable))
336 
337 ;; (define-vop (numeric-char=)
338 ;; (:args (x :scs (sb-vm::character-reg sb-vm::character-stack)
339 ;; :target r
340 ;; :load-if (not (location= x r))))
341 ;; (:info y)
342 ;; (:arg-types (:constant character) character)
343 ;; (:results (r :scs (sb-vm::unsigned-reg)
344 ;; :load-if (not (location= x r))))
345 ;; (:result-types sb-vm::unsigned-num)
346 ;; (:translate numeric-char=)
347 ;; (:policy :fast-safe)
348 ;; (:note "inline constant numeric-char=")
349 ;; (:generator 1
350 ;; (move r x)
351 ;; (sb-vm::inst #:xor r (char-code y)))))
352 
353 #+ (and sbcl (or x86 x86-64))
354 (defun numeric-char= (x y)
355  (declare (type character x y))
356  (logxor (char-code x)
357  (char-code y)))
358 
359 ;;; At each step, we may be able to find positions for which
360 ;;; there can only be one character. If we emit the check for
361 ;;; these positions as soon as possible, we avoid duplicating
362 ;;; potentially a lot of code. Since benchmarks have shown
363 ;;; it to be useful, this function implements the checks
364 ;;; as a series of (zerop (logior (numeric-char= ...)...)),
365 ;;; if there is more than one such check to emit.
366 
367 (defun emit-common-checks (strings to-check)
368  (labels ((emit-char= (pairs)
369  (mapcar (lambda (pair)
370  (destructuring-bind (posn . char)
371  pair
372  `(numeric-char= ,char
373  (aref ,*input-string* ,posn))))
374  pairs))
375  (emit-checking-form (common-chars)
376  (when common-chars
377  (let ((common-chars (sort common-chars '< :key 'car)))
378  #+ (and) `(and ,@(mapcar
379  (lambda (chunk)
380  (if (null (rest chunk))
381  (destructuring-bind ((posn . char))
382  chunk
383  `(eql ,char
384  (aref ,*input-string* ,posn)))
385  `(zerop
386  (logior ,@(emit-char= chunk)))))
387  (split-at common-chars 4)))
388  #+ (or) `(and ,@(mapcar
389  (lambda (pair)
390  (destructuring-bind (posn . char)
391  pair
392  `(eql ,char
393  (aref ,*input-string* ,posn))))
394  common-chars))))))
395  (let ((common-chars ())
396  (left-to-check ()))
397  (dolist (posn to-check (values (emit-checking-form common-chars)
398  (nreverse left-to-check)))
399  (if (all-equal strings :key (lambda (string)
400  (aref string posn)))
401  (push (cons posn (aref (first strings) posn))
402  common-chars)
403  (push posn left-to-check))))))
404 
405 ;;; The driving function: First, emit any test that is
406 ;;; common to all the candidates. If there's only one
407 ;;; candidate, then we just have to execute the body;
408 ;;; if not, we look for the `best' test and emit the
409 ;;; corresponding code: execute the test, and recurse
410 ;;; on the candidates that match the test and on those
411 ;;; that don't.
412 
413 (defun make-search-tree (strings bodies to-check)
414  (multiple-value-bind (guard to-check)
415  (emit-common-checks strings to-check)
416  (if (null (rest strings))
417  (progn
418  (assert (null to-check)) ; there shouldn't be anything left to check
419  (if guard
420  `(if ,guard
421  (progn ,@(first bodies))
422  ,*no-match-form*)
423  `(progn ,@(first bodies))))
424  (multiple-value-bind (posn char)
425  (find-best-split strings to-check)
426  (assert posn) ; this can only happen if all strings are equal
427  (let ((=strings ())
428  (=bodies ())
429  (/=strings ())
430  (/=bodies ()))
431  (loop
432  for string in strings
433  for body in bodies
434  do (if (eql char (aref string posn))
435  (progn
436  (push string =strings)
437  (push body =bodies))
438  (progn
439  (push string /=strings)
440  (push body /=bodies))))
441  (let ((tree `(if (eql ,char (aref ,*input-string* ,posn))
442  ,(make-search-tree =strings =bodies
443  (remove posn to-check))
444  ,(make-search-tree /=strings /=bodies
445  to-check))))
446  (if guard
447  `(if ,guard
448  ,tree
449  ,*no-match-form*)
450  tree)))))))
451 
452 ;;; Finally, we can glue it all together.
453 ;;; To recapitulate, first, dispatch on string
454 ;;; length, then execute a search tree for the
455 ;;; few candidates left, and finally make sure
456 ;;; the input string actually matches the one
457 ;;; candidate left at the leaf.
458 
459 (defun emit-string-case (cases input-var no-match)
460  (flet ((case-string-length (x)
461  (length (first x))))
462  (let ((*input-string* input-var)
463  (*no-match-form* no-match)
464  (cases-lists (split-tree (sort cases '<
465  :key #'case-string-length)
466  :key #'case-string-length)))
467  `(locally (declare (type vector ,input-var))
468  (case (length ,input-var)
469  ,@(loop for cases in cases-lists
470  for length = (case-string-length (first cases))
471  collect `((,length)
472  ;; arrays with fill pointers expose the total length
473  ;; in their type, not the position of the fill-pointer.
474  ;; The type below only applies to simple-arrays.
475  (locally (declare (type (or (not simple-array)
476  (simple-array * (,length)))
477  ,input-var))
478  ,(make-search-tree (mapcar 'first cases)
479  (mapcar 'rest cases)
480  (iota length)))))
481  (t ,no-match))))))
482 
483 ;;; Just wrapping the previous function in a macro,
484 ;;; and adding some error checking (the rest of the code
485 ;;; just assumes there won't be duplicate patterns).
486 ;;; Note how we use a local function instead of passing
487 ;;; the default form directly. This can save a lot on
488 ;;; code size, especially when the default form is
489 ;;; large.
490 
491 (defmacro string-case ((string &key (default '(error "No match")))
492  &body cases)
493  "(string-case (string &key default)
494  case*)
495  case ::= string form*
496  | t form*
497  Where t is the default case."
498  (let ((cases-table (make-hash-table :test 'equal)))
499  "Error checking cruft"
500  (dolist (case cases)
501  (assert (typep case '(cons (or string (eql t)))))
502  (let ((other-case (gethash (first case) cases-table)))
503  (if other-case
504  (warn "Duplicate string-case cases: ~A -> ~A or ~A~%"
505  (first case)
506  (rest other-case)
507  (rest case))
508  (setf (gethash (first case) cases-table)
509  (rest case)))))
510  (let ((input-var (gensym "INPUT"))
511  (default-fn (gensym "ON-ERROR"))
512  (default-body (gethash t cases-table (list default))))
513  `(let ((,input-var ,string))
514  (flet ((,default-fn ()
515  ,@default-body))
516  ,(emit-string-case (progn
517  (remhash t cases-table)
518  (hash-table->list cases-table))
519  input-var
520  `(,default-fn)))))))