1.1--- a/lisp/ffi/zstd/constants.lisp Tue Sep 17 22:19:19 2024 -0400
1.2+++ b/lisp/ffi/zstd/constants.lisp Wed Sep 18 21:48:06 2024 -0400
1.3@@ -1,4 +1,4 @@
1.4-("stddef.h" "zstd_errors.h" "zstd.h")
1.5+("stddef.h" "zstd_errors.h" "zstd.h" "zdict.h")
1.6 ((:integer +zstd-version-major+ "ZSTD_VERSION_MAJOR" t t)
1.7 (:integer +zstd-version-minor+ "ZSTD_VERSION_MINOR" t t)
1.8 (:integer +zstd-version-release+ "ZSTD_VERSION_RELEASE" t t)
2.1--- a/lisp/ffi/zstd/dict.lisp Tue Sep 17 22:19:19 2024 -0400
2.2+++ b/lisp/ffi/zstd/dict.lisp Wed Sep 18 21:48:06 2024 -0400
2.3@@ -2,8 +2,177 @@
2.4
2.5 ;;
2.6
2.7+;;; Commentary:
2.8+
2.9+;; From zdict.h:
2.10+#|
2.11+ * Zstd dictionary builder
2.12+ *
2.13+ * FAQ
2.14+ * ===
2.15+ * Why should I use a dictionary?
2.16+ * ------------------------------
2.17+ *
2.18+ * Zstd can use dictionaries to improve compression ratio of small data.
2.19+ * Traditionally small files don't compress well because there is very little
2.20+ * repetition in a single sample, since it is small. But, if you are compressing
2.21+ * many similar files, like a bunch of JSON records that share the same
2.22+ * structure, you can train a dictionary ahead of time on some samples of
2.23+ * these files. Then, zstd can use the dictionary to find repetitions that are
2.24+ * present across samples. This can vastly improve compression ratio.
2.25+ *
2.26+ * When is a dictionary useful?
2.27+ * ----------------------------
2.28+ *
2.29+ * Dictionaries are useful when compressing many small files that are similar.
2.30+ * The larger a file is, the less benefit a dictionary will have. Generally,
2.31+ * we don't expect dictionary compression to be effective past 100KB. And the
2.32+ * smaller a file is, the more we would expect the dictionary to help.
2.33+ *
2.34+ * How do I use a dictionary?
2.35+ * --------------------------
2.36+ *
2.37+ * Simply pass the dictionary to the zstd compressor with
2.38+ * `ZSTD_CCtx_loadDictionary()`. The same dictionary must then be passed to
2.39+ * the decompressor, using `ZSTD_DCtx_loadDictionary()`. There are other
2.40+ * more advanced functions that allow selecting some options, see zstd.h for
2.41+ * complete documentation.
2.42+ *
2.43+ * What is a zstd dictionary?
2.44+ * --------------------------
2.45+ *
2.46+ * A zstd dictionary has two pieces: Its header, and its content. The header
2.47+ * contains a magic number, the dictionary ID, and entropy tables. These
2.48+ * entropy tables allow zstd to save on header costs in the compressed file,
2.49+ * which really matters for small data. The content is just bytes, which are
2.50+ * repeated content that is common across many samples.
2.51+ *
2.52+ * What is a raw content dictionary?
2.53+ * ---------------------------------
2.54+ *
2.55+ * A raw content dictionary is just bytes. It doesn't have a zstd dictionary
2.56+ * header, a dictionary ID, or entropy tables. Any buffer is a valid raw
2.57+ * content dictionary.
2.58+ *
2.59+ * How do I train a dictionary?
2.60+ * ----------------------------
2.61+ *
2.62+ * Gather samples from your use case. These samples should be similar to each
2.63+ * other. If you have several use cases, you could try to train one dictionary
2.64+ * per use case.
2.65+ *
2.66+ * Pass those samples to `ZDICT_trainFromBuffer()` and that will train your
2.67+ * dictionary. There are a few advanced versions of this function, but this
2.68+ * is a great starting point. If you want to further tune your dictionary
2.69+ * you could try `ZDICT_optimizeTrainFromBuffer_cover()`. If that is too slow
2.70+ * you can try `ZDICT_optimizeTrainFromBuffer_fastCover()`.
2.71+ *
2.72+ * If the dictionary training function fails, that is likely because you
2.73+ * either passed too few samples, or a dictionary would not be effective
2.74+ * for your data. Look at the messages that the dictionary trainer printed,
2.75+ * if it doesn't say too few samples, then a dictionary would not be effective.
2.76+ *
2.77+ * How large should my dictionary be?
2.78+ * ----------------------------------
2.79+ *
2.80+ * A reasonable dictionary size, the `dictBufferCapacity`, is about 100KB.
2.81+ * The zstd CLI defaults to a 110KB dictionary. You likely don't need a
2.82+ * dictionary larger than that. But, most use cases can get away with a
2.83+ * smaller dictionary. The advanced dictionary builders can automatically
2.84+ * shrink the dictionary for you, and select the smallest size that doesn't
2.85+ * hurt compression ratio too much. See the `shrinkDict` parameter.
2.86+ * A smaller dictionary can save memory, and potentially speed up
2.87+ * compression.
2.88+ *
2.89+ * How many samples should I provide to the dictionary builder?
2.90+ * ------------------------------------------------------------
2.91+ *
2.92+ * We generally recommend passing ~100x the size of the dictionary
2.93+ * in samples. A few thousand should suffice. Having too few samples
2.94+ * can hurt the dictionary's effectiveness. Having more samples will
2.95+ * only improve the dictionary's effectiveness. But having too many
2.96+ * samples can slow down the dictionary builder.
2.97+ *
2.98+ * How do I determine if a dictionary will be effective?
2.99+ * -----------------------------------------------------
2.100+ *
2.101+ * Simply train a dictionary and try it out. You can use zstd's built in
2.102+ * benchmarking tool to test the dictionary effectiveness.
2.103+ *
2.104+ * # Benchmark levels 1-3 without a dictionary
2.105+ * zstd -b1e3 -r /path/to/my/files
2.106+ * # Benchmark levels 1-3 with a dictionary
2.107+ * zstd -b1e3 -r /path/to/my/files -D /path/to/my/dictionary
2.108+ *
2.109+ * When should I retrain a dictionary?
2.110+ * -----------------------------------
2.111+ *
2.112+ * You should retrain a dictionary when its effectiveness drops. Dictionary
2.113+ * effectiveness drops as the data you are compressing changes. Generally, we do
2.114+ * expect dictionaries to "decay" over time, as your data changes, but the rate
2.115+ * at which they decay depends on your use case. Internally, we regularly
2.116+ * retrain dictionaries, and if the new dictionary performs significantly
2.117+ * better than the old dictionary, we will ship the new dictionary.
2.118+ *
2.119+ * I have a raw content dictionary, how do I turn it into a zstd dictionary?
2.120+ * -------------------------------------------------------------------------
2.121+ *
2.122+ * If you have a raw content dictionary, e.g. by manually constructing it, or
2.123+ * using a third-party dictionary builder, you can turn it into a zstd
2.124+ * dictionary by using `ZDICT_finalizeDictionary()`. You'll also have to
2.125+ * provide some samples of the data. It will add the zstd header to the
2.126+ * raw content, which contains a dictionary ID and entropy tables, which
2.127+ * will improve compression ratio, and allow zstd to write the dictionary ID
2.128+ * into the frame, if you so choose.
2.129+ *
2.130+ * Do I have to use zstd's dictionary builder?
2.131+ * -------------------------------------------
2.132+ *
2.133+ * No! You can construct dictionary content however you please, it is just
2.134+ * bytes. It will always be valid as a raw content dictionary. If you want
2.135+ * a zstd dictionary, which can improve compression ratio, use
2.136+ * `ZDICT_finalizeDictionary()`.
2.137+ *
2.138+ * What is the attack surface of a zstd dictionary?
2.139+ * ------------------------------------------------
2.140+ *
2.141+ * Zstd is heavily fuzz tested, including loading fuzzed dictionaries, so
2.142+ * zstd should never crash, or access out-of-bounds memory no matter what
2.143+ * the dictionary is. However, if an attacker can control the dictionary
2.144+ * during decompression, they can cause zstd to generate arbitrary bytes,
2.145+ * just like if they controlled the compressed data.
2.146+ *
2.147+ ******************************************************************************/
2.148+
2.149+
2.150+/*! ZDICT_trainFromBuffer():
2.151+ * Train a dictionary from an array of samples.
2.152+ * Redirect towards ZDICT_optimizeTrainFromBuffer_fastCover() single-threaded, with d=8, steps=4,
2.153+ * f=20, and accel=1.
2.154+ * Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
2.155+ * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
2.156+ * The resulting dictionary will be saved into `dictBuffer`.
2.157+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
2.158+ * or an error code, which can be tested with ZDICT_isError().
2.159+ * Note: Dictionary training will fail if there are not enough samples to construct a
2.160+ * dictionary, or if most of the samples are too small (< 8 bytes being the lower limit).
2.161+ * If dictionary training fails, you should use zstd without a dictionary, as the dictionary
2.162+ * would've been ineffective anyways. If you believe your samples would benefit from a dictionary
2.163+ * please open an issue with details, and we can look into it.
2.164+ * Note: ZDICT_trainFromBuffer()'s memory usage is about 6 MB.
2.165+ * Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
2.166+ * It's possible to select smaller or larger size, just by specifying `dictBufferCapacity`.
2.167+ * In general, it's recommended to provide a few thousands samples, though this can vary a lot.
2.168+ * It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
2.169+ */
2.170+|#
2.171 ;;; Code:
2.172 (in-package :zstd)
2.173+(deferror zstd-ddict-error (zstd-alien-error) ()) ; DDict error: parent list is (zstd-alien-error), empty slot list, no :report option
2.174+(deferror zstd-cdict-error (zstd-alien-error) ; CDict error: same parent, but supplies its own :report function
2.175+ ()
2.176+ (:report (lambda (c s) ; C is the condition object, S the stream passed to FORMAT
2.177+ (format s "ZSTD CDict signalled error: ~A" (zstd-errorcode* (zstd-error-code c)))))) ; prints zstd-errorcode* of the condition's zstd-error-code
2.178
2.179 (define-alien-enum (zstd-dict-content-type int)
2.180 :auto 0
2.181@@ -114,3 +283,17 @@
2.182 (define-alien-routine "ZSTD_getDictID_fromFrame" unsigned
2.183 (src (* t))
2.184 (src-size size-t))
2.185+
2.186+(define-alien-routine "ZSTD_estimateDDictSize" size-t (dict-size size-t) (dict-load-method zstd-dict-load-method)) ; foreign symbols are case-sensitive: zstd.h declares ZSTD_estimateDDictSize
2.187+
2.188+(defmacro with-zstd-cdict ((cv &key buffer size (level (zstd-defaultclevel))) &body body)
2.189+  (let ((buf (gensym "BUF")) (raw (gensym "RAW"))) ; BUFFER is evaluated once, at run time; SIZE defaults to its run-time length
2.190+    `(with-alien ((,cv (* zstd-cdict) (let* ((,buf ,buffer) (,raw (make-alien (unsigned 8) (length ,buf)))) ; heap copy, still valid while the CDict is built
2.191+                                        (unwind-protect (progn (clone-octets-to-alien ,buf ,raw) (zstd-createcdict (cast ,raw (* t)) ,(or size `(length ,buf)) ,level)) (free-alien ,raw))))) ; copy is released after creation
2.192+       (unwind-protect (progn ,@body) (zstd-freecdict ,cv))))) ; CV is freed however BODY exits
2.193+
2.194+(defmacro with-zstd-ddict ((dv &key buffer size) &body body)
2.195+  (let ((buf (gensym "BUF")) (raw (gensym "RAW"))) ; BUFFER is evaluated once, at run time; SIZE defaults to its run-time length
2.196+    `(with-alien ((,dv (* zstd-ddict) (let* ((,buf ,buffer) (,raw (make-alien (unsigned 8) (length ,buf)))) ; heap copy, still valid while the DDict is built
2.197+                                        (unwind-protect (progn (clone-octets-to-alien ,buf ,raw) (zstd-createddict (cast ,raw (* t)) ,(or size `(length ,buf)))) (free-alien ,raw))))) ; copy is released after creation
2.198+       (unwind-protect (progn ,@body) (zstd-freeddict ,dv))))) ; DV is freed however BODY exits
3.1--- a/lisp/ffi/zstd/pkg.lisp Tue Sep 17 22:19:19 2024 -0400
3.2+++ b/lisp/ffi/zstd/pkg.lisp Wed Sep 18 21:48:06 2024 -0400
3.3@@ -67,7 +67,13 @@
3.4 :zstd-cstream :zstd-dstream :zstd-compressstream :zstd-decompressstream
3.5 :zstd-compressstream2 :zstd-outbuffer :zstd-geterrorname :zstd-geterrorcode
3.6 :zstdc :zstdd
3.7- :zstd-alien-error :zstd-dstream-error :zstd-cstream-error))
3.8+ :zstd-alien-error :zstd-dstream-error :zstd-cstream-error
3.9+ :with-zstd-streams
3.10+ :with-zstd-buffers
3.11+ :with-zstd-outbuffer
3.12+ :with-zstd-inbuffer
3.13+ :with-zstd-cdict
3.14+ :with-zstd-ddict))
3.15
3.16 (in-package :zstd)
3.17
4.1--- a/lisp/ffi/zstd/stream.lisp Tue Sep 17 22:19:19 2024 -0400
4.2+++ b/lisp/ffi/zstd/stream.lisp Wed Sep 18 21:48:06 2024 -0400
4.3@@ -56,6 +56,41 @@
4.4 (define-alien-routine "ZSTD_DStreamInSize" size-t)
4.5 (define-alien-routine "ZSTD_DStreamOutSize" size-t)
4.6
4.7+(defmacro with-zstd-inbuffer ((iv &key src size pos) &body body)
4.8+  ;; Bind IV to the result of (allocate-zstd-inbuffer), store each supplied field, run BODY, then free IV.
4.9+  `(with-alien ((,iv (* zstd-inbuffer) (allocate-zstd-inbuffer)))
4.10+     (unwind-protect
4.11+          (progn
4.12+            ,@(loop for (accessor value) in `((zstd-inbuffer-src ,src) (zstd-inbuffer-size ,size) (zstd-inbuffer-pos ,pos))
4.13+                    when value collect `(setf (,accessor ,iv) ,value))
4.14+            ,@body)
4.15+       (free-alien ,iv))))
4.16+
4.17+(defmacro with-zstd-outbuffer ((ov &key dst size pos) &body body)
4.18+  ;; Bind OV to the result of (allocate-zstd-outbuffer), store each supplied field, run BODY, then free OV.
4.19+  `(with-alien ((,ov (* zstd-outbuffer) (allocate-zstd-outbuffer)))
4.20+     (unwind-protect
4.21+          (progn
4.22+            ,@(loop for (accessor value) in `((zstd-outbuffer-dst ,dst) (zstd-outbuffer-size ,size) (zstd-outbuffer-pos ,pos))
4.23+                    when value collect `(setf (,accessor ,ov) ,value))
4.24+            ,@body)
4.25+       (free-alien ,ov))))
4.26+
4.27+(defmacro with-zstd-buffers ((iv ov &key src src-size src-pos dst dst-size dst-pos) &body body)
4.28+  ;; Bind IV and OV to the results of the in/out buffer allocators, store every
4.29+  ;; field whose keyword was supplied, run BODY, then free both structs.
4.30+  `(with-alien ((,iv (* zstd-inbuffer) (allocate-zstd-inbuffer))
4.31+                (,ov (* zstd-outbuffer) (allocate-zstd-outbuffer)))
4.32+     (unwind-protect
4.33+          (progn
4.34+            ,@(loop for (accessor target value)
4.35+                      in `((zstd-inbuffer-src ,iv ,src) (zstd-inbuffer-size ,iv ,src-size) (zstd-inbuffer-pos ,iv ,src-pos)
4.36+                           (zstd-outbuffer-dst ,ov ,dst) (zstd-outbuffer-size ,ov ,dst-size) (zstd-outbuffer-pos ,ov ,dst-pos))
4.37+                    when value collect `(setf (,accessor ,target) ,value))
4.38+            ,@body)
4.39+       (free-alien ,iv)
4.40+       (free-alien ,ov))))
4.41+
4.42 (defmacro with-zstd-cstream ((cv &key (init t) (close t) (level (zstd-defaultclevel)) ) &body body)
4.43 `(with-alien ((,cv (* zstd-cstream) (zstd-createcstream)))
4.44 (unwind-protect
4.45@@ -75,3 +110,19 @@
4.46 (zstd-dstream-error %dinit)))))
4.47 ,@body)
4.48 ,@(when close `((zstd-freedstream ,dv))))))
4.49+
4.50+(defmacro with-zstd-streams ((cv dv &key (init t) (close t) (level (zstd-defaultclevel))) &body body) ; bind CV/DV to new C/D streams around BODY
4.51+  `(with-alien ((,cv (* zstd-cstream) (zstd-createcstream))
4.52+                (,dv (* zstd-dstream) (zstd-createdstream)))
4.53+     (unwind-protect
4.54+          (progn
4.55+            ,@(when init `((let ((%cinit (zstd-initcstream ,cv ,level))
4.56+                                 (%dinit (zstd-initdstream ,dv)))
4.57+                             ;; Check each init result on its own and report it with the matching error.
4.58+                             (unless (zerop (zstd-iserror %cinit))
4.59+                               (zstd-cstream-error %cinit))
4.60+                             (unless (zerop (zstd-iserror %dinit))
4.61+                               (zstd-dstream-error %dinit)))))
4.62+            ,@body)
4.63+       ,@(when close `((zstd-freecstream ,cv) ; cleanup forms are emitted only when CLOSE is non-nil
4.64+                       (zstd-freedstream ,dv))))))
5.1--- a/lisp/ffi/zstd/tests.lisp Tue Sep 17 22:19:19 2024 -0400
5.2+++ b/lisp/ffi/zstd/tests.lisp Wed Sep 18 21:48:06 2024 -0400
5.3@@ -45,14 +45,14 @@
5.4 (is (< (zstd-cstreaminsize) (zstd-cstreamoutsize)))
5.5 (with-alien ((in (* zstd-inbuffer) (zstd::allocate-zstd-inbuffer))
5.6 (out (* zstd-outbuffer) (zstd::allocate-zstd-outbuffer)))
5.7- (let* ((str "this is a test yad ayd ay aya dayd ayd ada")
5.8- (len (length str)))
5.9+ (let* ((str "this is a test yad ayd ay aya dayd ayd ada"))
5.10 (setf (zstd::zstd-inbuffer-src in) (make-alien-string str)
5.11- (zstd::zstd-inbuffer-size in) len)
5.12+ (zstd::zstd-inbuffer-size in) (zstd-cstreaminsize))
5.13 (with-zstd-cstream (cs)
5.14 (is (zerop (zstd::zstd-initcstream cs (zstd-defaultclevel))))
5.15 (with-zstd-dstream (ds)
5.16 ;; (setf (zstd::zstd-outbuffer-dst out) (make-alien-string str))
5.17+ (setf (zstd::zstd-outbuffer-size out) (zstd-cstreamoutsize))
5.18 (zstd-compressstream cs out in)
5.19 (zstd::zstd-flushstream cs out)
5.20 (zstd::zstd-endstream cs out)
5.21@@ -62,10 +62,47 @@
5.22 c-string)
5.23 str)))))))
5.24
5.25-(deftest cstream ()
5.26- "Test streaming compression based on zstd.h HowTo guide.")
5.27-
5.28 (deftest streaming2 ()
5.29- "Test the Zstd v2 Streaming API.")
5.30-;; simple-dictionary
5.31-;; builk-dictionary
5.32+ "Test the Zstd v2 Streaming API."
5.33+ (let ((test "test 1 2 3"))
5.34+ (with-zstd-buffers (in out :src (make-alien-string test)) ; only :src is passed; no size, pos or dst keyword is given here
5.35+ (with-zstd-streams (cs ds)
5.36+ (zstd-compressstream2 cs out in 0) ; NOTE(review): 0/1/2 are presumably the ZSTD_EndDirective values (continue/flush/end) - confirm
5.37+ (zstd-compressstream2 cs out in 1)
5.38+ (is (zerop (zstd-iserror (zstd-compressstream2 cs out in 2)))) ; only this third call's result is checked
5.39+ (zstd::zstd-flushstream cs out)
5.40+ (is (zerop (zstd-iserror (zstd::zstd-endstream cs out))))
5.41+ (zstd-decompressstream ds out in) ; return value is not checked
5.42+ (is (string-equal
5.43+ (cast (zstd::zstd-inbuffer-src in) c-string) ; NOTE(review): reads back IN's src, which was set from TEST above; OUT is not inspected - confirm intent
5.44+ test))))))
5.45+
5.46+(deftest simple-dictionary () ; exercises zstd-compress-usingdict / zstd-decompress-usingdict and asserts neither result is a zstd error
5.47+ (let ((test "test 1 2 3"))
5.48+ (with-alien ((dict (* t)) ; DICT is declared without an initial value
5.49+ (dst (array (unsigned 8) 100))) ; 100-octet destination array
5.50+ (with-zstd-buffers (in out :src (cast (make-alien-string test) (* t)) :dst (cast dst (* t)) :dst-size 100) ; :src-size is not passed, so no store to IN's size is emitted
5.51+ (is (= 100 (zstd::zstd-outbuffer-size out)))
5.52+ (with-zstd-streams (cs ds)
5.53+ (is
5.54+ (zerop
5.55+ (zstd-iserror
5.56+ (zstd::zstd-compress-usingdict
5.57+ cs
5.58+ (zstd::zstd-outbuffer-dst out) (zstd::zstd-outbuffer-size out)
5.59+ (zstd::zstd-inbuffer-src in) (zstd::zstd-inbuffer-size in)
5.60+ dict (length test) (zstd-defaultclevel))))) ; NOTE(review): DICT has no visible initialisation before this call, yet a non-zero size is passed - confirm
5.61+ (is
5.62+ (zerop
5.63+ (zstd-iserror
5.64+ (zstd::zstd-decompress-usingdict
5.65+ ds
5.66+ (zstd::zstd-outbuffer-dst out) (zstd::zstd-outbuffer-size out) ; NOTE(review): same dst/src arguments as the compress call above - confirm the input should not be the compressed bytes
5.67+ (zstd::zstd-inbuffer-src in) (zstd::zstd-inbuffer-size in)
5.68+ dict (length test))))))))))
5.69+
5.70+(deftest bulk-dictionary () ; checks only the alien type of the objects bound by the two dictionary macros
5.71+ (with-zstd-ddict (dd :buffer #(1 2 3)) ; buffer is a literal vector; no :size is passed
5.72+ (is (typep dd '(alien (* (struct zstd::zstd-ddict-s))))))
5.73+ (with-zstd-cdict (cd :buffer #(4 5 6)) ; no :level is passed, so the macro's default level form is used
5.74+ (is (typep cd '(alien (* (struct zstd::zstd-cdict-s)))))))
6.1--- a/lisp/std/alien.lisp Tue Sep 17 22:19:19 2024 -0400
6.2+++ b/lisp/std/alien.lisp Wed Sep 18 21:48:06 2024 -0400
6.3@@ -99,17 +99,26 @@
6.4 (push c-string reversed-result)
6.5 (return (nreverse reversed-result)))))))
6.6
6.7-(defmacro clone-octets-to-alien (lispa aliena)
6.8- (with-gensyms (i)
6.9- `(loop for ,i from 0 below (length ,lispa)
6.10- do (setf (deref ,aliena ,i)
6.11- (aref ,lispa ,i)))))
6.12+(defun clone-octets-to-alien (lispa aliena)
6.13+  ;; Copy every element of LISPA into ALIENA at the same index; return ALIENA.
6.14+  (declare (optimize (speed 3)))
6.15+  (dotimes (idx (length lispa) aliena)
6.16+    (setf (deref aliena idx)
6.17+          (aref lispa idx))))
6.18
6.19-(defmacro clone-octets-from-alien (aliena lispa len)
6.20- (with-gensyms (i)
6.21- `(loop for ,i from 0 below ,len
6.22- do (setf (aref ,lispa ,i)
6.23- (deref ,aliena ,i)))))
6.24+(defmacro octets-to-alien (lispa) ; expands to code that copies LISPA's elements into a local (unsigned 8) alien array
6.25+ (with-gensyms (a)
6.26+ `(with-alien ((,a (array (unsigned 8) ,(length lispa)))) ; array length is LENGTH of the LISPA form itself, computed at macroexpansion time
6.27+ (clone-octets-to-alien ,lispa ,a)))) ; NOTE(review): the value returned is the WITH-ALIEN local; SBCL documents such locals as dynamic-extent, so it may be invalid once this form returns - confirm
6.28+
6.29+(defun clone-octets-from-alien (aliena lispa &optional len)
6.30+  ;; Fill LISPA from index 0 up to (not including) LEN with ALIENA's elements; LEN defaults to (length LISPA). Returns LISPA.
6.31+  (declare (optimize (speed 3))
6.32+           (array lispa))
6.33+  (let ((count (or len (length lispa))))
6.34+    (do ((idx 0 (1+ idx)))
6.35+        ((>= idx count) lispa)
6.36+      (setf (aref lispa idx) (deref aliena idx)))))
6.37
6.38 (defun foreign-int-to-integer (buffer size)
6.39 "Check SIZE of int BUFFER. return BUFFER."
7.1--- a/lisp/std/pkg.lisp Tue Sep 17 22:19:19 2024 -0400
7.2+++ b/lisp/std/pkg.lisp Wed Sep 18 21:48:06 2024 -0400
7.3@@ -156,6 +156,7 @@
7.4 :copy-c-string
7.5 :clone-strings
7.6 :clone-octets-to-alien
7.7+ :octets-to-alien
7.8 :clone-octets-from-alien
7.9 :foreign-int-to-integer
7.10 :foreign-int-to-bool
8.1--- a/lisp/tests.lisp Tue Sep 17 22:19:19 2024 -0400
8.2+++ b/lisp/tests.lisp Wed Sep 18 21:48:06 2024 -0400
8.3@@ -17,4 +17,4 @@
8.4 (mapcar (lambda (x) (do-tests x force)) (remove *test-suite* *test-suite-list*)))
8.5
8.6 (deftest all ()
8.7- (run-all-tests t))
8.8+ (do-tests *test-suite-list*