core:lisp/std/string.lisp

changelog shortlog graph tags branches changeset files revisions annotate raw help
Mercurial > core / lisp/std/string.lisp

changeset 658:	804b5ee20a46
parent:	a0dfde3cb3c4
author:	Richard Westhaver <ellis@rwest.io>
date:	Thu, 19 Sep 2024 23:23:02 -0400
permissions:	-rw-r--r--
description:	zstd completed (besides zdict), working on readline
     1 ;;; std/str.lisp --- String utilities
     2 
     3 ;;; Code:
     4 
     5 ;; (defvar sb-unicode-syms 
     6 ;;   '(words lines sentences whitespace-p uppercase lowercase titlecase
     7 ;;     word-break-class line-break-class sentence-break-class char-block
     8 ;;     cased-p uppercase-p lowercase-p titlecase-p casefold
     9 ;;     graphemes grapheme-break-class
    10 ;;     bidi-mirroring-glyph bidi-class
    11 ;;     normalize-string normalized-p default-ignorable-p
    12 ;;     confusable-p hex-digit-p mirrored-p alphabetic-p math-p
    13 ;;     decimal-value digit-value
    14 ;;     unicode< unicode> unicode= unicode-equal
    15 ;;     unicode<= unicode>=))
    16 (in-package :std/string)
    17 
    18 ;; (mapc (lambda (s) (export s)) sb-unicode-syms)
    19 ;; (reexport-from 
    20 ;;  :sb-unicode
    21 ;;  :include sb-unicode-syms)
    22 
    23 (defparameter *omit-nulls* nil)
    24 (defvar *whitespaces* (list #\Backspace #\Tab #\Linefeed #\Newline #\Vt #\Page
    25                             #\Return #\Space #\Rubout
    26                             #+sbcl #\Next-Line #-sbcl (code-char 133)
    27                             #\No-break_space)
    28   "On some implementations, linefeed and newline represent the same character (code).")
    29 
    30 (deftype string-designator ()
    31   "A string designator type. A string designator is either a string, a symbol,
    32 or a character."
    33   `(or symbol string character))
    34 
    35 (defun ssplit (separator s &key (omit-nulls *omit-nulls*))
    36   "Split s into substring by separator (cl-ppcre takes a regex, we do not).
    37 
    38    `limit' limits the number of elements returned (i.e. the string is
    39    split at most `limit' - 1 times)."
    40   ;; cl-ppcre:split doesn't return a null string if the separator appears at the end of s.
    41   (let* ((res (cl-ppcre:split separator s)))
    42     (if omit-nulls
    43         (remove-if (lambda (it) (sequence:emptyp it)) res)
    44         res)))
    45 
    46 (defun collapse-whitespaces (s)
    47   "Ensure there is only one space character between words.
    48   Remove newlines."
    49   (cl-ppcre:regex-replace-all "\\s+" s " "))
    50 
    51 (defun trim (s &key (char-bag *whitespaces*))
    52   "Removes all characters in `char-bag` (default: whitespaces) at the beginning and end of `s`.
    53    If supplied, char-bag has to be a sequence (e.g. string or list of characters).
    54 
    55    Examples: (trim \"  foo \") => \"foo\"
    56              (trim \"+-*foo-bar*-+\" :char-bag \"+-*\") => \"foo-bar\"
    57              (trim \"afood\" :char-bag (str:concat \"a\" \"d\")) => \"foo\""
    58   (when s
    59     (string-trim char-bag s)))
    60 
    61 ;;;  TODO 2023-08-27: camel snake kebab
    62 
    63 (defun make-template-parser (start-delimiter end-delimiter &key (ignore-case nil))
    64   "Returns a closure than can substitute variables
    65   delimited by \"start-delimiter\" and \"end-delimiter\"
    66   in a string, by the provided values."
    67   (check-type start-delimiter string)
    68   (check-type end-delimiter string)
    69   (when (or (string= start-delimiter "")
    70             (string= end-delimiter ""))
    71       (error 'simple-type-error
    72               :format-control "The empty string is not a valid delimiter."))
    73   (let ((start-len (length start-delimiter))
    74         (end-len (length end-delimiter))
    75         (test (if ignore-case
    76                   #'string-equal
    77                   #'string=)))
    78 
    79     (lambda (string values)
    80       (check-type string string)
    81       (unless (listp values)
    82         (error 'simple-type-error
    83                :format-control "values should be an association list"))
    84 
    85       (with-output-to-string (stream)
    86         (loop for prev = 0 then (+ j end-len)
    87               for i = (search start-delimiter string)
    88                       then (search start-delimiter string :start2 j)
    89               for j = (if i (search end-delimiter string :start2 i))
    90                       then (if i (search end-delimiter string :start2 i))
    91               while (and i j)
    92           do (write-string (subseq string prev i) stream)
    93              (let ((instance (rest (assoc (subseq string (+ i start-len) j)
    94                                           values
    95                                           :test test))))
    96                (if instance
    97                 (princ instance stream)
    98                 (write-string (subseq string i (+ j end-len)) stream)))
    99 
   100           finally (write-string (subseq string prev) stream))))))
   101 
   102 ;;; STRING-CASE
   103 ;;; Implementing an efficient string= case in Common Lisp
   104 ;;;
   105 ;;; 2015-11-15: Defknown don't have explicit-check in SBCL 1.3.0
   106 ;;;  Remove the declaration.  It's never useful the way we use
   107 ;;;  numeric-char=.
   108 ;;;
   109 ;;; 2015-11-15: Make this a real ASDF system for Xach
   110 ;;;  I copied the system definition from Quicklisp and mangled as
   111 ;;;  necessary.
   112 ;;;
   113 ;;; 2010-06-30: Tiny bugfix
   114 ;;;  Widen the type declarations inside cases to allow vectors that
   115 ;;;  have a length that's shorter than the total size (due to fill-
   116 ;;;  pointers).
   117 
   118 ;;;
   119 ;;;# Introduction
   120 ;;;
   121 ;;; In `<http://neverfriday.com/blog/?p=10>', OMouse asks how
   122 ;;; best to implement a `string= case' (in Scheme). I noted that
   123 ;;; naively iterating through the cases with `string=' at runtime
   124 ;;; is suboptimal. Seeing the problem as a simplistic pattern
   125 ;;; matching one makes an efficient solution obvious.
   126 ;;; Note that, unlike Haskell, both Scheme and CL have random-
   127 ;;; access on strings in O(1), something which I exploit to
   128 ;;; generate better code.
   129 ;;;
   130 ;;; This is also a pbook.el file (the pdf can be found at
   131 ;;; `<http://www.discontinuity.info/~pkhuong/string-case.pdf>' ).
   132 ;;; I'm new at this not-quite-illiterate programming thing, so
   133 ;;; please bear with me (: I'm also looking for comments on the
   134 ;;; formatting. I'm particularly iffy with the way keywords look
   135 ;;; like. It just looks really fuzzy when you're not really zoomed
   136 ;;; in (or reading it on paper).
   137 
   138 ;;; I usually don't use packages for throw-away code, but this looks
   139 ;;; like it could be useful to someone.
   140 
   141 ;;;# Some utility code
   142 
   143 (defun split-tree (list &key (test 'eql) (key 'identity))
   144   "Splits input list into sublists of elements
   145    whose elements are all such that (key element)
   146    are all test.
   147    It's assumed that test and key form an equality class.
   148    (This is similar to groupBy)"
   149   (when list
   150     (let* ((lists ())
   151            (cur-list (list (first list)))
   152            (cur-key  (funcall key (first list))))
   153       (dolist (elt (rest list) (nreverse (cons (nreverse cur-list)
   154                                                lists)))
   155         (let ((new-key (funcall key elt)))
   156           (if (funcall test cur-key new-key)
   157               (push elt cur-list)
   158               (progn
   159                 (push (nreverse cur-list) lists)
   160                 (setf cur-list (list elt)
   161                       cur-key  new-key))))))))
   162 
   163 (defun iota (n)
   164   (loop for i below n collect i))
   165 
   166 (defun hash-table->list (table &key (keep-keys t) (keep-values t))
   167   "Saves the keys and/or values in table to a list.
   168    As with hash table iterating functions, there is no
   169    implicit ordering."
   170   (let ((list ()))
   171     (maphash (cond ((and keep-keys
   172                          keep-values)
   173                     (lambda (k v)
   174                       (push (cons k v) list)))
   175                    (keep-keys
   176                     (lambda (k v)
   177                       (declare (ignore v))
   178                       (push k list)))
   179                    (keep-values
   180                     (lambda (k v)
   181                       (declare (ignore k))
   182                       (push v list))))
   183              table)
   184     list))
   185 
   186 (defun all-equal (list &key (key 'identity) (test 'eql))
   187   (if (or (null list)
   188           (null (rest list)))
   189       t
   190       (let ((first-key (funcall key (first list))))
   191         (every (lambda (element)
   192                  (funcall test first-key
   193                           (funcall key element)))
   194                (rest list)))))
   195 
   196 (defun split-at (list n)
   197   "Split list in k lists of n elements (or less for the last list)"
   198   (declare (type (and fixnum (integer (0))) n))
   199   (let ((lists    '())
   200         (cur-list '())
   201         (counter  0))
   202     (declare (type (and fixnum unsigned-byte) counter))
   203     (dolist (elt list (nreverse (if cur-list
   204                                     (cons (nreverse cur-list)
   205                                           lists)
   206                                     lists)))
   207       (push elt cur-list)
   208       (when (= (incf counter) n)
   209         (push (nreverse cur-list) lists)
   210         (setf cur-list '()
   211               counter   0)))))
   212 
   213 ;;;# The string matching compiler per se
   214 ;;;
   215 ;;; I use special variables here because I find that
   216 ;;; preferable to introducing noise everywhere to thread
   217 ;;; these values through all the calls, especially
   218 ;;; when `*no-match-form*' is only used at the very end.
   219 
   220 (defparameter *input-string* nil
   221   "Symbol of the variable holding the input string")
   222 
   223 (defparameter *no-match-form* nil
   224   "Form to insert when no match is found.")
   225 
   226 ;;; The basic idea of the pattern matching process here is
   227 ;;; to first discriminate with the input string's length;
   228 ;;; once that is done, it is very easy to safely use random
   229 ;;; access until only one candidate string (pattern) remains.
   230 ;;; However, even if we determine that only one case might be
   231 ;;; a candidate, it might still be possible for another string
   232 ;;; (not in the set of cases) to match the criteria. So we also
   233 ;;; have to make sure that *all* the indices match. A simple
   234 ;;; way to do this would be to emit the remaining checks at the
   235 ;;; every end, when only one candidate is left. However, that
   236 ;;; can result in a lot of duplicate code, and some useless
   237 ;;; work on mismatches. Instead, the code generator always
   238 ;;; tries to find (new) indices for which all the candidates
   239 ;;; left in the branch share the same character, and then emits
   240 ;;; a guard, checking the character at that index as soon as possible.
   241 
   242 ;;; In my experience, there are two main problems when writing
   243 ;;; pattern matchers: how to decide what to test for at each
   244 ;;; fork, and how to ensure the code won't explode exponentially.
   245 ;;; Luckily, for our rather restricted pattern language (equality
   246 ;;; on strings), patterns can't overlap, and it's possible to guarantee
   247 ;;; that no candidate will ever be possible in both branches of a
   248 ;;; fork.
   249 
   250 ;;; Due to the the latter guarantee, we have a simple fitness
   251 ;;; measure for tests: simply maximising the number of
   252 ;;; candidates in the smallest branch will make our search tree
   253 ;;; as balanced as possible. Of course, we don't know whether
   254 ;;; the subtrees will be balanced too, but I don't think it'll
   255 ;;; be much of an issue.
   256 
   257 ;;; Note that, if we had access, whether via annotations or profiling,
   258 ;;; to the probability of each case, the situation would be very
   259 ;;; different. In fact, on a pipelined machine where branch
   260 ;;; mispredictions are expensive, an unbalanced tree will yield
   261 ;;; better expected runtimes. There was a very interesting and rather
   262 ;;; sophisticated Google lecture on that topic on Google video
   263 ;;; (the speaker used markov chains to model dynamic predictors,
   264 ;;; for example), but I can't seem to find the URL.
   265 
   266 ;;; TODO: Find bounds on the size of the code!
   267 
   268 (defun find-best-split (strings to-check)
   269   "Iterate over all the indices left to check to find
   270    which index (and which character) to test for equality
   271    with, keeping the ones which result in the most balanced
   272    split."
   273   (flet ((evaluate-split (i char)
   274            "Just count all the matches and mismatches"
   275            (let ((=  0)
   276                  (/= 0))
   277              (dolist (string strings (min = /=))
   278                (if (eql (aref string i) char)
   279                    (incf =)
   280                    (incf /=)))))
   281          (uniquify-chars (chars)
   282            "Only keep one copy of each char in the list"
   283            (mapcar 'first (split-tree (sort chars 'char<) :test #'eql))))
   284     (let ((best-split 0)            ; maximise size of smallest branch
   285           (best-posn  nil)
   286           (best-char  nil))
   287       (dolist (i to-check (values best-posn best-char))
   288         (dolist (char (uniquify-chars (mapcar (lambda (string)
   289                                                 (aref string i))
   290                                               strings)))
   291           (let ((Z (evaluate-split i char)))
   292             (when (> Z best-split)
   293               (setf best-split Z
   294                     best-posn  i
   295                     best-char  char))))))))
   296 
   297 ;;; We sometimes have to execute sequences of checks for
   298 ;;; equality. The natural way to express this is via a
   299 ;;; sequence of checks, wrapped in an `and'. However, that
   300 ;;; translates to a sequence of conditional branches, predicated
   301 ;;; on very short computations. On (not so) modern architectures,
   302 ;;; it'll be faster to coalesce a sequence of such checks together
   303 ;;; as straightline code (e.g. via `or' of `xor'), and only branch
   304 ;;; at the very end. The code doesn't become much more complex,
   305 ;;; and benchmarks have shown it to be beneficial (giving a speed
   306 ;;; up of 2-5% for both predictable and unpredictable workloads,
   307 ;;; on a Core 2).
   308 
   309 ;;; Benchmarks (and experience) have shown that, instead of executing
   310 ;;; a cascade of comparison/conditional branch, it's slightly
   311 ;;; faster, both for predictable and unpredictable workloads,
   312 ;;; to `or' together a bunch of comparisons (e.g. `xor'). On a Core 2
   313 ;;; processor, it seems that doing so for sequences of around 4
   314 ;;; comparisons is the sweetspot. On perfectly predictable input,
   315 ;;; aborting early (on the first check) saves as much time as
   316 ;;; the 4 test/conditional branch add, compared to a sequence of
   317 ;;; `xor' and `or'. 
   318 
   319 ;;; Numeric char= abstracts out the xor check, and, on SBCL,
   320 ;;; is replaced by a short assembly sequence when the first
   321 ;;; argument is a constant. The declared return type is then
   322 ;;; wider than strictly necessary making it fit in a machine
   323 ;;; register, but not as a fixnum ensures that the compiler
   324 ;;; won't repeatedly convert the values to fixnums, when all
   325 ;;; we'll do is `or' them together and check for zero-ness.
   326 ;;; This function is the only place where the macro isn't
   327 ;;; generic over the elements stored in the cases. It shouldn't
   328 ;;; be too hard to implement a numeric-eql, which would
   329 ;;; restore genericity to the macro, while keeping the 
   330 ;;; speed-up.
   331 
   332 ;; (progn
   333 ;;   (defknown numeric-char= (character character)
   334 ;;       (unsigned-byte #. (1- sb-vm:n-machine-word-bits))
   335 ;;       (movable foldable flushable))
   336 
   337 ;;   (define-vop (numeric-char=)
   338 ;;     (:args (x :scs (sb-vm::character-reg sb-vm::character-stack)
   339 ;;               :target r
   340 ;;               :load-if (not (location= x r))))
   341 ;;     (:info y)
   342 ;;     (:arg-types (:constant character) character)
   343 ;;     (:results (r :scs (sb-vm::unsigned-reg)
   344 ;;                  :load-if (not (location= x r))))
   345 ;;     (:result-types sb-vm::unsigned-num)
   346 ;;     (:translate numeric-char=)
   347 ;;     (:policy :fast-safe)
   348 ;;     (:note "inline constant numeric-char=")
   349 ;;     (:generator 1
   350 ;;       (move r x)
   351 ;;       (sb-vm::inst #:xor r (char-code y)))))
   352 
   353 #+ (and sbcl (or x86 x86-64))
   354 (defun numeric-char= (x y)
   355   (declare (type character x y))
   356   (logxor (char-code x)
   357           (char-code y)))
   358 
   359 ;;; At each step, we may be able to find positions for which
   360 ;;; there can only be one character. If we emit the check for
   361 ;;; these positions as soon as possible, we avoid duplicating
   362 ;;; potentially a lot of code. Since benchmarks have shown
   363 ;;; it to be useful, this function implements the checks
   364 ;;; as a series of (zerop (logior (numeric-char= ...)...)),
   365 ;;; if there is more than one such check to emit.
   366 
   367 (defun emit-common-checks (strings to-check)
   368   (labels ((emit-char= (pairs)
   369              (mapcar (lambda (pair)
   370                        (destructuring-bind (posn . char)
   371                            pair
   372                          `(numeric-char= ,char
   373                                          (aref ,*input-string* ,posn))))
   374                      pairs))
   375            (emit-checking-form (common-chars)
   376              (when common-chars
   377                (let ((common-chars (sort common-chars '< :key 'car)))
   378                  #+ (and) `(and ,@(mapcar
   379                                    (lambda (chunk)
   380                                      (if (null (rest chunk))
   381                                          (destructuring-bind ((posn . char))
   382                                              chunk
   383                                            `(eql ,char
   384                                                  (aref ,*input-string* ,posn)))
   385                                          `(zerop
   386                                            (logior ,@(emit-char= chunk)))))
   387                                    (split-at common-chars 4)))
   388                  #+ (or) `(and ,@(mapcar
   389                                   (lambda (pair)
   390                                     (destructuring-bind (posn . char)
   391                                         pair
   392                                       `(eql ,char
   393                                             (aref ,*input-string* ,posn))))
   394                                   common-chars))))))
   395     (let ((common-chars  ())
   396           (left-to-check ()))
   397       (dolist (posn to-check (values (emit-checking-form common-chars)
   398                                      (nreverse           left-to-check)))
   399         (if (all-equal strings :key (lambda (string)
   400                                       (aref string posn)))
   401             (push (cons posn (aref (first strings) posn))
   402                   common-chars)
   403             (push posn left-to-check))))))
   404 
   405 ;;; The driving function: First, emit any test that is
   406 ;;; common to all the candidates. If there's only one
   407 ;;; candidate, then we just have to execute the body;
   408 ;;; if not, we look for the `best' test and emit the
   409 ;;; corresponding code: execute the test, and recurse
   410 ;;; on the candidates that match the test and on those
   411 ;;; that don't.
   412 
   413 (defun make-search-tree (strings bodies to-check)
   414   (multiple-value-bind (guard to-check)
   415       (emit-common-checks strings to-check)
   416     (if (null (rest strings))
   417         (progn
   418           (assert (null to-check)) ; there shouldn't be anything left to check
   419           (if guard
   420               `(if ,guard
   421                    (progn ,@(first bodies))
   422                    ,*no-match-form*)
   423               `(progn ,@(first bodies))))
   424         (multiple-value-bind (posn char)
   425             (find-best-split strings to-check)
   426           (assert posn) ; this can only happen if all strings are equal
   427           (let ((=strings  ())
   428                 (=bodies   ())
   429                 (/=strings ())
   430                 (/=bodies  ()))
   431             (loop
   432                for string in strings
   433                for body   in bodies
   434                do (if (eql char (aref string posn))
   435                       (progn
   436                         (push string =strings)
   437                         (push body   =bodies))
   438                       (progn
   439                         (push string /=strings)
   440                         (push body   /=bodies))))
   441             (let ((tree `(if (eql ,char (aref ,*input-string* ,posn))
   442                              ,(make-search-tree  =strings   =bodies
   443                                                  (remove posn to-check))
   444                              ,(make-search-tree /=strings /=bodies
   445                                                 to-check))))
   446               (if guard
   447                   `(if ,guard
   448                        ,tree
   449                        ,*no-match-form*)
   450                   tree)))))))
   451 
   452 ;;; Finally, we can glue it all together.
   453 ;;; To recapitulate, first, dispatch on string
   454 ;;; length, then execute a search tree for the
   455 ;;; few candidates left, and finally make sure
   456 ;;; the input string actually matches the one 
   457 ;;; candidate left at the leaf.
   458 
   459 (defun emit-string-case (cases input-var no-match)
   460   (flet ((case-string-length (x)
   461            (length (first x))))
   462     (let ((*input-string*  input-var)
   463           (*no-match-form* no-match)
   464           (cases-lists     (split-tree (sort cases '<
   465                                         :key #'case-string-length)
   466                                   :key #'case-string-length)))
   467       `(locally (declare (type vector ,input-var))
   468          (case (length ,input-var)
   469            ,@(loop for cases in cases-lists
   470                    for length = (case-string-length (first cases))
   471                    collect `((,length)
   472                              ;; arrays with fill pointers expose the total length
   473                              ;; in their type, not the position of the fill-pointer.
   474                              ;; The type below only applies to simple-arrays.
   475                              (locally (declare (type (or (not simple-array)
   476                                                          (simple-array * (,length)))
   477                                                      ,input-var))
   478                                ,(make-search-tree (mapcar 'first cases)
   479                                                   (mapcar 'rest  cases)
   480                                                   (iota length)))))
   481            (t ,no-match))))))
   482 
   483 ;;; Just wrapping the previous function in a macro,
   484 ;;; and adding some error checking (the rest of the code
   485 ;;; just assumes there won't be duplicate patterns).
   486 ;;; Note how we use a local function instead of passing
   487 ;;; the default form directly. This can save a lot on
   488 ;;; code size, especially when the default form is
   489 ;;; large.
   490 
   491 (defmacro string-case ((string &key (default '(error "No match")))
   492                        &body cases)
   493   "(string-case (string &key default)
   494      case*)
   495    case ::= string form*
   496           | t      form*
   497    Where t is the default case."
   498   (let ((cases-table (make-hash-table :test 'equal)))
   499     "Error checking cruft"
   500     (dolist (case cases)
   501       (assert (typep case '(cons (or string (eql t)))))
   502       (let ((other-case (gethash (first case) cases-table)))
   503         (if other-case
   504             (warn "Duplicate string-case cases: ~A -> ~A or ~A~%"
   505                   (first case)
   506                   (rest other-case)
   507                   (rest case))
   508             (setf (gethash (first case) cases-table)
   509                   (rest case)))))
   510     (let ((input-var    (gensym "INPUT"))
   511           (default-fn   (gensym "ON-ERROR"))
   512           (default-body (gethash t cases-table (list default))))
   513       `(let ((,input-var ,string))
   514          (flet ((,default-fn ()
   515                   ,@default-body))
   516            ,(emit-string-case (progn
   517                                 (remhash t cases-table)
   518                                 (hash-table->list cases-table))
   519                               input-var
   520                               `(,default-fn)))))))