# HG changeset patch
# User Richard Westhaver <ellis@rwest.io>
# Date 1726710486 14400
# Node ID 937a6f35404771168e61a2ad38d213689cc6c945
# Parent  b499d4bcfc397ff6c1c2e85987242aab9ee58975
zstd tests and macros

diff -r b499d4bcfc39 -r 937a6f354047 lisp/ffi/zstd/constants.lisp
--- a/lisp/ffi/zstd/constants.lisp	Tue Sep 17 22:19:19 2024 -0400
+++ b/lisp/ffi/zstd/constants.lisp	Wed Sep 18 21:48:06 2024 -0400
@@ -1,4 +1,4 @@
-("stddef.h" "zstd_errors.h" "zstd.h")
+("stddef.h" "zstd_errors.h" "zstd.h" "zdict.h")
 ((:integer +zstd-version-major+ "ZSTD_VERSION_MAJOR" t t)
  (:integer +zstd-version-minor+ "ZSTD_VERSION_MINOR" t t)
  (:integer +zstd-version-release+ "ZSTD_VERSION_RELEASE" t t)
diff -r b499d4bcfc39 -r 937a6f354047 lisp/ffi/zstd/dict.lisp
--- a/lisp/ffi/zstd/dict.lisp	Tue Sep 17 22:19:19 2024 -0400
+++ b/lisp/ffi/zstd/dict.lisp	Wed Sep 18 21:48:06 2024 -0400
@@ -2,8 +2,177 @@
 
 ;; 
 
+;;; Commentary:
+
+;; From zdict.h:
+#|
+ * Zstd dictionary builder
+ *
+ * FAQ
+ * ===
+ * Why should I use a dictionary?
+ * ------------------------------
+ *
+ * Zstd can use dictionaries to improve compression ratio of small data.
+ * Traditionally small files don't compress well because there is very little
+ * repetition in a single sample, since it is small. But, if you are compressing
+ * many similar files, like a bunch of JSON records that share the same
+ * structure, you can train a dictionary ahead of time on some samples of
+ * these files. Then, zstd can use the dictionary to find repetitions that are
+ * present across samples. This can vastly improve compression ratio.
+ *
+ * When is a dictionary useful?
+ * ----------------------------
+ *
+ * Dictionaries are useful when compressing many small files that are similar.
+ * The larger a file is, the less benefit a dictionary will have. Generally,
+ * we don't expect dictionary compression to be effective past 100KB. And the
+ * smaller a file is, the more we would expect the dictionary to help.
+ *
+ * How do I use a dictionary?
+ * --------------------------
+ *
+ * Simply pass the dictionary to the zstd compressor with
+ * `ZSTD_CCtx_loadDictionary()`. The same dictionary must then be passed to
+ * the decompressor, using `ZSTD_DCtx_loadDictionary()`. There are other
+ * more advanced functions that allow selecting some options, see zstd.h for
+ * complete documentation.
+ *
+ * What is a zstd dictionary?
+ * --------------------------
+ *
+ * A zstd dictionary has two pieces: Its header, and its content. The header
+ * contains a magic number, the dictionary ID, and entropy tables. These
+ * entropy tables allow zstd to save on header costs in the compressed file,
+ * which really matters for small data. The content is just bytes, which are
+ * repeated content that is common across many samples.
+ *
+ * What is a raw content dictionary?
+ * ---------------------------------
+ *
+ * A raw content dictionary is just bytes. It doesn't have a zstd dictionary
+ * header, a dictionary ID, or entropy tables. Any buffer is a valid raw
+ * content dictionary.
+ *
+ * How do I train a dictionary?
+ * ----------------------------
+ *
+ * Gather samples from your use case. These samples should be similar to each
+ * other. If you have several use cases, you could try to train one dictionary
+ * per use case.
+ *
+ * Pass those samples to `ZDICT_trainFromBuffer()` and that will train your
+ * dictionary. There are a few advanced versions of this function, but this
+ * is a great starting point. If you want to further tune your dictionary
+ * you could try `ZDICT_optimizeTrainFromBuffer_cover()`. If that is too slow
+ * you can try `ZDICT_optimizeTrainFromBuffer_fastCover()`.
+ *
+ * If the dictionary training function fails, that is likely because you
+ * either passed too few samples, or a dictionary would not be effective
+ * for your data. Look at the messages that the dictionary trainer printed,
+ * if it doesn't say too few samples, then a dictionary would not be effective.
+ *
+ * How large should my dictionary be?
+ * ----------------------------------
+ *
+ * A reasonable dictionary size, the `dictBufferCapacity`, is about 100KB.
+ * The zstd CLI defaults to a 110KB dictionary. You likely don't need a
+ * dictionary larger than that. But, most use cases can get away with a
+ * smaller dictionary. The advanced dictionary builders can automatically
+ * shrink the dictionary for you, and select the smallest size that doesn't
+ * hurt compression ratio too much. See the `shrinkDict` parameter.
+ * A smaller dictionary can save memory, and potentially speed up
+ * compression.
+ *
+ * How many samples should I provide to the dictionary builder?
+ * ------------------------------------------------------------
+ *
+ * We generally recommend passing ~100x the size of the dictionary
+ * in samples. A few thousand should suffice. Having too few samples
+ * can hurt the dictionary's effectiveness. Having more samples will
+ * only improve the dictionary's effectiveness. But having too many
+ * samples can slow down the dictionary builder.
+ *
+ * How do I determine if a dictionary will be effective?
+ * -----------------------------------------------------
+ *
+ * Simply train a dictionary and try it out. You can use zstd's built in
+ * benchmarking tool to test the dictionary effectiveness.
+ *
+ *   # Benchmark levels 1-3 without a dictionary
+ *   zstd -b1e3 -r /path/to/my/files
+ *   # Benchmark levels 1-3 with a dictionary
+ *   zstd -b1e3 -r /path/to/my/files -D /path/to/my/dictionary
+ *
+ * When should I retrain a dictionary?
+ * -----------------------------------
+ *
+ * You should retrain a dictionary when its effectiveness drops. Dictionary
+ * effectiveness drops as the data you are compressing changes. Generally, we do
+ * expect dictionaries to "decay" over time, as your data changes, but the rate
+ * at which they decay depends on your use case. Internally, we regularly
+ * retrain dictionaries, and if the new dictionary performs significantly
+ * better than the old dictionary, we will ship the new dictionary.
+ *
+ * I have a raw content dictionary, how do I turn it into a zstd dictionary?
+ * -------------------------------------------------------------------------
+ *
+ * If you have a raw content dictionary, e.g. by manually constructing it, or
+ * using a third-party dictionary builder, you can turn it into a zstd
+ * dictionary by using `ZDICT_finalizeDictionary()`. You'll also have to
+ * provide some samples of the data. It will add the zstd header to the
+ * raw content, which contains a dictionary ID and entropy tables, which
+ * will improve compression ratio, and allow zstd to write the dictionary ID
+ * into the frame, if you so choose.
+ *
+ * Do I have to use zstd's dictionary builder?
+ * -------------------------------------------
+ *
+ * No! You can construct dictionary content however you please, it is just
+ * bytes. It will always be valid as a raw content dictionary. If you want
+ * a zstd dictionary, which can improve compression ratio, use
+ * `ZDICT_finalizeDictionary()`.
+ *
+ * What is the attack surface of a zstd dictionary?
+ * ------------------------------------------------
+ *
+ * Zstd is heavily fuzz tested, including loading fuzzed dictionaries, so
+ * zstd should never crash, or access out-of-bounds memory no matter what
+ * the dictionary is. However, if an attacker can control the dictionary
+ * during decompression, they can cause zstd to generate arbitrary bytes,
+ * just like if they controlled the compressed data.
+ *
+ ******************************************************************************/
+
+
+/*! ZDICT_trainFromBuffer():
+ *  Train a dictionary from an array of samples.
+ *  Redirect towards ZDICT_optimizeTrainFromBuffer_fastCover() single-threaded, with d=8, steps=4,
+ *  f=20, and accel=1.
+ *  Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
+ *  supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
+ *  The resulting dictionary will be saved into `dictBuffer`.
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
+ *          or an error code, which can be tested with ZDICT_isError().
+ *  Note:  Dictionary training will fail if there are not enough samples to construct a
+ *         dictionary, or if most of the samples are too small (< 8 bytes being the lower limit).
+ *         If dictionary training fails, you should use zstd without a dictionary, as the dictionary
+ *         would've been ineffective anyways. If you believe your samples would benefit from a dictionary
+ *         please open an issue with details, and we can look into it.
+ *  Note: ZDICT_trainFromBuffer()'s memory usage is about 6 MB.
+ *  Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
+ *        It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`.
+ *        In general, it's recommended to provide a few thousand samples, though this can vary a lot.
+ *        It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
+ */
+|#
 ;;; Code:
 (in-package :zstd)
+(deferror zstd-ddict-error (zstd-alien-error) ()) ; DDict error; ZSTD-ALIEN-ERROR listed as parent, no custom report
+(deferror zstd-cdict-error (zstd-alien-error) ; CDict error; ZSTD-ALIEN-ERROR listed as parent
+    () ; no additional slots
+    (:report (lambda (c s) ; C is the condition being reported, S the stream written to
+               (format s "ZSTD CDict signalled error: ~A" (zstd-errorcode* (zstd-error-code c))))))
 
 (define-alien-enum (zstd-dict-content-type int)
                    :auto 0
@@ -114,3 +283,17 @@
 (define-alien-routine "ZSTD_getDictID_fromFrame" unsigned
   (src (* t))
   (src-size size-t))
+
+(define-alien-routine "ZSTD_estimateDDictSize" size-t (dict-size size-t) (dict-load-method zstd-dict-load-method)) ; zstd.h: ZSTD_estimateDDictSize(dictSize, dictLoadMethod)
+
+(defmacro with-zstd-cdict ((cv &key buffer size (level (zstd-defaultclevel))) &body body)
+  "Bind CV to a CDict created from BUFFER at LEVEL around BODY; ZSTD-FREECDICT runs on exit. SIZE defaults to (length BUFFER)."
+  (let ((size (or size `(length ,buffer)))) ; emit a run-time LENGTH call; never measure the unevaluated BUFFER form
+    `(with-alien ((,cv (* zstd-cdict) (zstd-createcdict (cast (octets-to-alien ,buffer) (* t)) ,size ,level)))
+       (unwind-protect (progn ,@body) (zstd-freecdict ,cv)))))
+
+(defmacro with-zstd-ddict ((dv &key buffer size) &body body)
+  "Bind DV to a DDict created from BUFFER around BODY; ZSTD-FREEDDICT runs on exit. SIZE defaults to (length BUFFER)."
+  (let ((size (or size `(length ,buffer)))) ; emit a run-time LENGTH call; never measure the unevaluated BUFFER form
+    `(with-alien ((,dv (* zstd-ddict) (zstd-createddict (cast (octets-to-alien ,buffer) (* t)) ,size)))
+       (unwind-protect (progn ,@body) (zstd-freeddict ,dv)))))
diff -r b499d4bcfc39 -r 937a6f354047 lisp/ffi/zstd/pkg.lisp
--- a/lisp/ffi/zstd/pkg.lisp	Tue Sep 17 22:19:19 2024 -0400
+++ b/lisp/ffi/zstd/pkg.lisp	Wed Sep 18 21:48:06 2024 -0400
@@ -67,7 +67,13 @@
    :zstd-cstream :zstd-dstream :zstd-compressstream :zstd-decompressstream
    :zstd-compressstream2 :zstd-outbuffer :zstd-geterrorname :zstd-geterrorcode
    :zstdc :zstdd
-   :zstd-alien-error :zstd-dstream-error :zstd-cstream-error))
+   :zstd-alien-error :zstd-dstream-error :zstd-cstream-error
+   :with-zstd-streams
+   :with-zstd-buffers
+   :with-zstd-outbuffer
+   :with-zstd-inbuffer
+   :with-zstd-cdict
+   :with-zstd-ddict))
 
 (in-package :zstd)
 
diff -r b499d4bcfc39 -r 937a6f354047 lisp/ffi/zstd/stream.lisp
--- a/lisp/ffi/zstd/stream.lisp	Tue Sep 17 22:19:19 2024 -0400
+++ b/lisp/ffi/zstd/stream.lisp	Wed Sep 18 21:48:06 2024 -0400
@@ -56,6 +56,41 @@
 (define-alien-routine "ZSTD_DStreamInSize" size-t)
 (define-alien-routine "ZSTD_DStreamOutSize" size-t)
 
+(defmacro with-zstd-inbuffer ((iv &key src size pos) &body body)
+  "Bind IV to the result of ALLOCATE-ZSTD-INBUFFER around BODY and FREE-ALIEN it on exit.
+SRC, SIZE and POS, when supplied, are stored in the matching slots before BODY runs."
+  (let ((slots `(,@(when src `((zstd-inbuffer-src ,iv) ,src))
+                 ,@(when size `((zstd-inbuffer-size ,iv) ,size))
+                 ,@(when pos `((zstd-inbuffer-pos ,iv) ,pos)))))
+    `(with-alien ((,iv (* zstd-inbuffer) (allocate-zstd-inbuffer)))
+       (unwind-protect (progn (setf ,@slots) ,@body)
+         (free-alien ,iv)))))
+
+(defmacro with-zstd-outbuffer ((ov &key dst size pos) &body body)
+  "Bind OV to the result of ALLOCATE-ZSTD-OUTBUFFER around BODY and FREE-ALIEN it on exit.
+DST, SIZE and POS, when supplied, are stored in the matching slots before BODY runs."
+  (let ((slots `(,@(when dst `((zstd-outbuffer-dst ,ov) ,dst))
+                 ,@(when size `((zstd-outbuffer-size ,ov) ,size))
+                 ,@(when pos `((zstd-outbuffer-pos ,ov) ,pos)))))
+    `(with-alien ((,ov (* zstd-outbuffer) (allocate-zstd-outbuffer)))
+       (unwind-protect (progn (setf ,@slots) ,@body)
+         (free-alien ,ov)))))
+  
+(defmacro with-zstd-buffers ((iv ov &key src src-size src-pos dst dst-size dst-pos) &body body)
+  "Bind IV and OV to freshly allocated zstd in/out buffers around BODY; both are freed on exit.
+Each supplied key is stored in its slot before BODY runs: SRC, SRC-SIZE and SRC-POS on IV,
+DST, DST-SIZE and DST-POS on OV."
+  (let ((slots `(,@(when src `((zstd-inbuffer-src ,iv) ,src))
+                 ,@(when src-size `((zstd-inbuffer-size ,iv) ,src-size))
+                 ,@(when src-pos `((zstd-inbuffer-pos ,iv) ,src-pos))
+                 ,@(when dst `((zstd-outbuffer-dst ,ov) ,dst))
+                 ,@(when dst-size `((zstd-outbuffer-size ,ov) ,dst-size))
+                 ,@(when dst-pos `((zstd-outbuffer-pos ,ov) ,dst-pos)))))
+    `(with-alien ((,iv (* zstd-inbuffer) (allocate-zstd-inbuffer))
+                  (,ov (* zstd-outbuffer) (allocate-zstd-outbuffer)))
+       (unwind-protect (progn (setf ,@slots) ,@body)
+         (free-alien ,iv) (free-alien ,ov)))))
+       
 (defmacro with-zstd-cstream ((cv &key (init t) (close t) (level (zstd-defaultclevel)) ) &body body)
   `(with-alien ((,cv (* zstd-cstream) (zstd-createcstream)))
      (unwind-protect
@@ -75,3 +110,19 @@
                                (zstd-dstream-error %dinit)))))
             ,@body)
        ,@(when close `((zstd-freedstream ,dv))))))
+
+(defmacro with-zstd-streams ((cv dv &key (init t) (close t) (level (zstd-defaultclevel))) &body body)
+  "Bind CV and DV to freshly created compression and decompression streams around BODY.
+When INIT, CV is initialized at LEVEL and DV is initialized; a failed init code is passed to
+ZSTD-CSTREAM-ERROR or ZSTD-DSTREAM-ERROR respectively. When CLOSE, both streams are freed on exit."
+  `(with-alien ((,cv (* zstd-cstream) (zstd-createcstream))
+                (,dv (* zstd-dstream) (zstd-createdstream)))
+     (unwind-protect
+          (progn
+            ,@(when init `((let ((%cinit (zstd-initcstream ,cv ,level))
+                                 (%dinit (zstd-initdstream ,dv)))
+                             (unless (zerop (zstd-iserror %cinit)) (zstd-cstream-error %cinit))
+                             (unless (zerop (zstd-iserror %dinit)) (zstd-dstream-error %dinit)))))
+            ,@body)
+       ,@(when close `((zstd-freecstream ,cv)
+                       (zstd-freedstream ,dv))))))
diff -r b499d4bcfc39 -r 937a6f354047 lisp/ffi/zstd/tests.lisp
--- a/lisp/ffi/zstd/tests.lisp	Tue Sep 17 22:19:19 2024 -0400
+++ b/lisp/ffi/zstd/tests.lisp	Wed Sep 18 21:48:06 2024 -0400
@@ -45,14 +45,14 @@
   (is (< (zstd-cstreaminsize) (zstd-cstreamoutsize)))
   (with-alien ((in (* zstd-inbuffer) (zstd::allocate-zstd-inbuffer))
                (out (* zstd-outbuffer) (zstd::allocate-zstd-outbuffer)))
-    (let* ((str "this is a test yad ayd ay aya dayd ayd ada")
-           (len (length str)))
+    (let* ((str "this is a test yad ayd ay aya dayd ayd ada"))
       (setf (zstd::zstd-inbuffer-src in) (make-alien-string str)
-            (zstd::zstd-inbuffer-size in) len)
+            (zstd::zstd-inbuffer-size in) (zstd-cstreaminsize))
       (with-zstd-cstream (cs)
         (is (zerop (zstd::zstd-initcstream cs (zstd-defaultclevel))))
         (with-zstd-dstream (ds)
           ;; (setf (zstd::zstd-outbuffer-dst out) (make-alien-string str))
+          (setf (zstd::zstd-outbuffer-size out) (zstd-cstreamoutsize))
           (zstd-compressstream cs out in)
           (zstd::zstd-flushstream cs out)
           (zstd::zstd-endstream cs out)
@@ -62,10 +62,47 @@
                      c-string)
                str)))))))
 
-(deftest cstream ()
-  "Test streaming compression based on zstd.h HowTo guide.")
-  
 (deftest streaming2 ()
-  "Test the Zstd v2 Streaming API.")
-;; simple-dictionary
-;; builk-dictionary
+  "Test the Zstd v2 Streaming API."
+  (let ((test "test 1 2 3"))
+    (with-zstd-buffers (in out :src (make-alien-string test)) ; only :src is supplied; no sizes or :dst are set here
+      (with-zstd-streams (cs ds) ; both streams created with the macro's default init/close/level
+        (zstd-compressstream2 cs out in 0) ; last argument presumably a ZSTD_EndDirective value — confirm against zstd.h
+        (zstd-compressstream2 cs out in 1)
+        (is (zerop (zstd-iserror (zstd-compressstream2 cs out in 2)))) ; only this third call's result is checked
+        (zstd::zstd-flushstream cs out)
+        (is (zerop (zstd-iserror (zstd::zstd-endstream cs out))))
+        (zstd-decompressstream ds out in) ; NOTE(review): result is not checked and OUT/IN are reused as-is — confirm intent
+        (is (string-equal ; compares IN's src pointer, read as a c-string, with the original TEST string
+             (cast (zstd::zstd-inbuffer-src in) c-string)
+             test))))))
+
+(deftest simple-dictionary ()
+  (let ((test "test 1 2 3"))
+    (with-alien ((dict (* t)) ; DICT is declared without an initial value
+                 (dst (array (unsigned 8) 100))) ; 100-element octet array used as the destination
+      (with-zstd-buffers (in out :src (cast (make-alien-string test) (* t)) :dst (cast dst (* t)) :dst-size 100)
+        (is (= 100 (zstd::zstd-outbuffer-size out))) ; the :dst-size key was stored in OUT's size slot
+        (with-zstd-streams (cs ds)
+          (is ; compression call must not return an error code
+           (zerop
+            (zstd-iserror
+             (zstd::zstd-compress-usingdict 
+              cs 
+              (zstd::zstd-outbuffer-dst out) (zstd::zstd-outbuffer-size out) 
+              (zstd::zstd-inbuffer-src in) (zstd::zstd-inbuffer-size in) ; NOTE(review): IN's size is never set in this test — confirm
+              dict (length test) (zstd-defaultclevel))))) ; NOTE(review): DICT is never assigned, yet its size is given as (length test) — confirm
+          (is ; decompression call must not return an error code
+           (zerop
+            (zstd-iserror
+             (zstd::zstd-decompress-usingdict 
+              ds 
+              (zstd::zstd-outbuffer-dst out) (zstd::zstd-outbuffer-size out) 
+              (zstd::zstd-inbuffer-src in) (zstd::zstd-inbuffer-size in)
+              dict (length test)))))))))) ; same unassigned DICT and (length test) size as the compress call
+
+(deftest bulk-dictionary () ; checks only the alien type bound by each dictionary macro
+  (with-zstd-ddict (dd :buffer #(1 2 3)) ; literal three-element vector as dictionary content
+    (is (typep dd '(alien (* (struct zstd::zstd-ddict-s))))))
+  (with-zstd-cdict (cd :buffer #(4 5 6)) ; level is left at the macro's default
+    (is (typep cd '(alien (* (struct zstd::zstd-cdict-s)))))))
diff -r b499d4bcfc39 -r 937a6f354047 lisp/std/alien.lisp
--- a/lisp/std/alien.lisp	Tue Sep 17 22:19:19 2024 -0400
+++ b/lisp/std/alien.lisp	Wed Sep 18 21:48:06 2024 -0400
@@ -99,17 +99,26 @@
             (push c-string reversed-result)
             (return (nreverse reversed-result)))))))
 
-(defmacro clone-octets-to-alien (lispa aliena)
-  (with-gensyms (i)
-    `(loop for ,i from 0 below (length ,lispa)
-        do (setf (deref ,aliena ,i)
-                 (aref ,lispa ,i)))))
+(defun clone-octets-to-alien (lispa aliena)
+  "Store every element of LISPA at the same index of ALIENA, then return ALIENA."
+  (declare (optimize (speed 3)))
+  (dotimes (idx (length lispa) aliena)
+    (setf (deref aliena idx)
+          (aref lispa idx))))
 
-(defmacro clone-octets-from-alien (aliena lispa len)
-  (with-gensyms (i)
-    `(loop for ,i from 0 below ,len
-           do (setf (aref ,lispa ,i)
-                 (deref ,aliena ,i)))))
+(defmacro octets-to-alien (lispa)
+  "Copy the vector LISPA into a fresh MAKE-ALIEN octet buffer sized at run time and return it; the caller must FREE-ALIEN it."
+  (with-gensyms (a) ; heap allocation: a WITH-ALIEN local must not be returned past its extent
+    `(let ((,a ,lispa)) (clone-octets-to-alien ,a (make-alien (unsigned 8) (length ,a))))))
+
+(defun clone-octets-from-alien (aliena lispa &optional len)
+  "Copy the first LEN elements of ALIENA into the array LISPA and return LISPA.
+LEN defaults to (length LISPA)."
+  (declare (optimize (speed 3))
+           (array lispa))
+  (loop for idx from 0 below (or len (length lispa))
+        do (setf (aref lispa idx) (deref aliena idx)))
+  lispa)
 
 (defun foreign-int-to-integer (buffer size)
   "Check SIZE of int BUFFER. return BUFFER."
diff -r b499d4bcfc39 -r 937a6f354047 lisp/std/pkg.lisp
--- a/lisp/std/pkg.lisp	Tue Sep 17 22:19:19 2024 -0400
+++ b/lisp/std/pkg.lisp	Wed Sep 18 21:48:06 2024 -0400
@@ -156,6 +156,7 @@
    :copy-c-string
    :clone-strings
    :clone-octets-to-alien
+   :octets-to-alien
    :clone-octets-from-alien
    :foreign-int-to-integer
    :foreign-int-to-bool
diff -r b499d4bcfc39 -r 937a6f354047 lisp/tests.lisp
--- a/lisp/tests.lisp	Tue Sep 17 22:19:19 2024 -0400
+++ b/lisp/tests.lisp	Wed Sep 18 21:48:06 2024 -0400
@@ -17,4 +17,4 @@
   (mapcar (lambda (x) (do-tests x force)) (remove *test-suite* *test-suite-list*)))
 
 (deftest all ()
-  (run-all-tests t))
+  (do-tests *test-suite-list*