# HG changeset patch # User Richard Westhaver # Date 1726710486 14400 # Node ID 937a6f35404771168e61a2ad38d213689cc6c945 # Parent b499d4bcfc397ff6c1c2e85987242aab9ee58975 zstd tests and macros diff -r b499d4bcfc39 -r 937a6f354047 lisp/ffi/zstd/constants.lisp --- a/lisp/ffi/zstd/constants.lisp Tue Sep 17 22:19:19 2024 -0400 +++ b/lisp/ffi/zstd/constants.lisp Wed Sep 18 21:48:06 2024 -0400 @@ -1,4 +1,4 @@ -("stddef.h" "zstd_errors.h" "zstd.h") +("stddef.h" "zstd_errors.h" "zstd.h" "zdict.h") ((:integer +zstd-version-major+ "ZSTD_VERSION_MAJOR" t t) (:integer +zstd-version-minor+ "ZSTD_VERSION_MINOR" t t) (:integer +zstd-version-release+ "ZSTD_VERSION_RELEASE" t t) diff -r b499d4bcfc39 -r 937a6f354047 lisp/ffi/zstd/dict.lisp --- a/lisp/ffi/zstd/dict.lisp Tue Sep 17 22:19:19 2024 -0400 +++ b/lisp/ffi/zstd/dict.lisp Wed Sep 18 21:48:06 2024 -0400 @@ -2,8 +2,177 @@ ;; +;;; Commentary: + +;; From zdict.h: +#| + * Zstd dictionary builder + * + * FAQ + * === + * Why should I use a dictionary? + * ------------------------------ + * + * Zstd can use dictionaries to improve compression ratio of small data. + * Traditionally small files don't compress well because there is very little + * repetition in a single sample, since it is small. But, if you are compressing + * many similar files, like a bunch of JSON records that share the same + * structure, you can train a dictionary on ahead of time on some samples of + * these files. Then, zstd can use the dictionary to find repetitions that are + * present across samples. This can vastly improve compression ratio. + * + * When is a dictionary useful? + * ---------------------------- + * + * Dictionaries are useful when compressing many small files that are similar. + * The larger a file is, the less benefit a dictionary will have. Generally, + * we don't expect dictionary compression to be effective past 100KB. And the + * smaller a file is, the more we would expect the dictionary to help. + * + * How do I use a dictionary? 
+ * -------------------------- + * + * Simply pass the dictionary to the zstd compressor with + * `ZSTD_CCtx_loadDictionary()`. The same dictionary must then be passed to + * the decompressor, using `ZSTD_DCtx_loadDictionary()`. There are other + * more advanced functions that allow selecting some options, see zstd.h for + * complete documentation. + * + * What is a zstd dictionary? + * -------------------------- + * + * A zstd dictionary has two pieces: Its header, and its content. The header + * contains a magic number, the dictionary ID, and entropy tables. These + * entropy tables allow zstd to save on header costs in the compressed file, + * which really matters for small data. The content is just bytes, which are + * repeated content that is common across many samples. + * + * What is a raw content dictionary? + * --------------------------------- + * + * A raw content dictionary is just bytes. It doesn't have a zstd dictionary + * header, a dictionary ID, or entropy tables. Any buffer is a valid raw + * content dictionary. + * + * How do I train a dictionary? + * ---------------------------- + * + * Gather samples from your use case. These samples should be similar to each + * other. If you have several use cases, you could try to train one dictionary + * per use case. + * + * Pass those samples to `ZDICT_trainFromBuffer()` and that will train your + * dictionary. There are a few advanced versions of this function, but this + * is a great starting point. If you want to further tune your dictionary + * you could try `ZDICT_optimizeTrainFromBuffer_cover()`. If that is too slow + * you can try `ZDICT_optimizeTrainFromBuffer_fastCover()`. + * + * If the dictionary training function fails, that is likely because you + * either passed too few samples, or a dictionary would not be effective + * for your data. Look at the messages that the dictionary trainer printed, + * if it doesn't say too few samples, then a dictionary would not be effective. 
+ * + * How large should my dictionary be? + * ---------------------------------- + * + * A reasonable dictionary size, the `dictBufferCapacity`, is about 100KB. + * The zstd CLI defaults to a 110KB dictionary. You likely don't need a + * dictionary larger than that. But, most use cases can get away with a + * smaller dictionary. The advanced dictionary builders can automatically + * shrink the dictionary for you, and select the smallest size that doesn't + * hurt compression ratio too much. See the `shrinkDict` parameter. + * A smaller dictionary can save memory, and potentially speed up + * compression. + * + * How many samples should I provide to the dictionary builder? + * ------------------------------------------------------------ + * + * We generally recommend passing ~100x the size of the dictionary + * in samples. A few thousand should suffice. Having too few samples + * can hurt the dictionary's effectiveness. Having more samples will + * only improve the dictionary's effectiveness. But having too many + * samples can slow down the dictionary builder. + * + * How do I determine if a dictionary will be effective? + * ----------------------------------------------------- + * + * Simply train a dictionary and try it out. You can use zstd's built-in + * benchmarking tool to test the dictionary's effectiveness. + * + * # Benchmark levels 1-3 without a dictionary + * zstd -b1e3 -r /path/to/my/files + * # Benchmark levels 1-3 with a dictionary + * zstd -b1e3 -r /path/to/my/files -D /path/to/my/dictionary + * + * When should I retrain a dictionary? + * ----------------------------------- + * + * You should retrain a dictionary when its effectiveness drops. Dictionary + * effectiveness drops as the data you are compressing changes. Generally, we do + * expect dictionaries to "decay" over time, as your data changes, but the rate + * at which they decay depends on your use case. 
Internally, we regularly + * retrain dictionaries, and if the new dictionary performs significantly + * better than the old dictionary, we will ship the new dictionary. + * + * I have a raw content dictionary, how do I turn it into a zstd dictionary? + * ------------------------------------------------------------------------- + * + * If you have a raw content dictionary, e.g. by manually constructing it, or + * using a third-party dictionary builder, you can turn it into a zstd + * dictionary by using `ZDICT_finalizeDictionary()`. You'll also have to + * provide some samples of the data. It will add the zstd header to the + * raw content, which contains a dictionary ID and entropy tables, which + * will improve compression ratio, and allow zstd to write the dictionary ID + * into the frame, if you so choose. + * + * Do I have to use zstd's dictionary builder? + * ------------------------------------------- + * + * No! You can construct dictionary content however you please, it is just + * bytes. It will always be valid as a raw content dictionary. If you want + * a zstd dictionary, which can improve compression ratio, use + * `ZDICT_finalizeDictionary()`. + * + * What is the attack surface of a zstd dictionary? + * ------------------------------------------------ + * + * Zstd is heavily fuzz tested, including loading fuzzed dictionaries, so + * zstd should never crash, or access out-of-bounds memory no matter what + * the dictionary is. However, if an attacker can control the dictionary + * during decompression, they can cause zstd to generate arbitrary bytes, + * just like if they controlled the compressed data. + * + ******************************************************************************/ + + +/*! ZDICT_trainFromBuffer(): + * Train a dictionary from an array of samples. + * Redirect towards ZDICT_optimizeTrainFromBuffer_fastCover() single-threaded, with d=8, steps=4, + * f=20, and accel=1. 
+ * Samples must be stored concatenated in a single flat buffer `samplesBuffer`, + * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order. + * The resulting dictionary will be saved into `dictBuffer`. + * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) + * or an error code, which can be tested with ZDICT_isError(). + * Note: Dictionary training will fail if there are not enough samples to construct a + * dictionary, or if most of the samples are too small (< 8 bytes being the lower limit). + * If dictionary training fails, you should use zstd without a dictionary, as the dictionary + * would've been ineffective anyways. If you believe your samples would benefit from a dictionary + * please open an issue with details, and we can look into it. + * Note: ZDICT_trainFromBuffer()'s memory usage is about 6 MB. + * Tips: In general, a reasonable dictionary has a size of ~ 100 KB. + * It's possible to select a smaller or larger size, just by specifying `dictBufferCapacity`. + * In general, it's recommended to provide a few thousand samples, though this can vary a lot. + * It's recommended that the total size of all samples be about 100 times the target size of the dictionary. 
+ */ +|# ;;; Code: (in-package :zstd) +(deferror zstd-ddict-error (zstd-alien-error) ()) +(deferror zstd-cdict-error (zstd-alien-error) + () + (:report (lambda (c s) + (format s "ZSTD CDict signalled error: ~A" (zstd-errorcode* (zstd-error-code c)))))) (define-alien-enum (zstd-dict-content-type int) :auto 0 @@ -114,3 +283,17 @@ (define-alien-routine "ZSTD_getDictID_fromFrame" unsigned (src (* t)) (src-size size-t)) + +(define-alien-routine "ZSTD_estimatedDictSize" size-t (dict-size size-t) (dict-load-method zstd-dict-load-method)) + +(defmacro with-zstd-cdict ((cv &key buffer size (level (zstd-defaultclevel))) &body body) + (let ((size (or size (length buffer)))) + `(with-alien ((,cv (* zstd-cdict) (zstd-createcdict (cast (octets-to-alien ,buffer) (* t)) ,size ,level))) + (unwind-protect (progn ,@body) + (zstd-freecdict ,cv))))) + +(defmacro with-zstd-ddict ((dv &key buffer size) &body body) + (let ((size (or size (length buffer)))) + `(with-alien ((,dv (* zstd-ddict) (zstd-createddict (cast (octets-to-alien ,buffer) (* t)) ,size))) + (unwind-protect (progn ,@body) + (zstd-freeddict ,dv))))) diff -r b499d4bcfc39 -r 937a6f354047 lisp/ffi/zstd/pkg.lisp --- a/lisp/ffi/zstd/pkg.lisp Tue Sep 17 22:19:19 2024 -0400 +++ b/lisp/ffi/zstd/pkg.lisp Wed Sep 18 21:48:06 2024 -0400 @@ -67,7 +67,13 @@ :zstd-cstream :zstd-dstream :zstd-compressstream :zstd-decompressstream :zstd-compressstream2 :zstd-outbuffer :zstd-geterrorname :zstd-geterrorcode :zstdc :zstdd - :zstd-alien-error :zstd-dstream-error :zstd-cstream-error)) + :zstd-alien-error :zstd-dstream-error :zstd-cstream-error + :with-zstd-streams + :with-zstd-buffers + :with-zstd-outbuffer + :with-zstd-inbuffer + :with-zstd-cdict + :with-zstd-ddict)) (in-package :zstd) diff -r b499d4bcfc39 -r 937a6f354047 lisp/ffi/zstd/stream.lisp --- a/lisp/ffi/zstd/stream.lisp Tue Sep 17 22:19:19 2024 -0400 +++ b/lisp/ffi/zstd/stream.lisp Wed Sep 18 21:48:06 2024 -0400 @@ -56,6 +56,41 @@ (define-alien-routine "ZSTD_DStreamInSize" size-t) 
(define-alien-routine "ZSTD_DStreamOutSize" size-t)
 
+(defmacro with-zstd-inbuffer ((iv &key src size pos) &body body)
+  `(with-alien ((,iv (* zstd-inbuffer) (allocate-zstd-inbuffer)))
+     (unwind-protect
+          (progn
+            ,@(when src `((setf (zstd-inbuffer-src ,iv) ,src)))
+            ,@(when size `((setf (zstd-inbuffer-size ,iv) ,size)))
+            ,@(when pos `((setf (zstd-inbuffer-pos ,iv) ,pos)))
+            ,@body)
+       (free-alien ,iv))))
+
+(defmacro with-zstd-outbuffer ((ov &key dst size pos) &body body)
+  `(with-alien ((,ov (* zstd-outbuffer) (allocate-zstd-outbuffer)))
+     (unwind-protect
+          (progn
+            ,@(when dst `((setf (zstd-outbuffer-dst ,ov) ,dst)))
+            ,@(when size `((setf (zstd-outbuffer-size ,ov) ,size)))
+            ,@(when pos `((setf (zstd-outbuffer-pos ,ov) ,pos)))
+            ,@body)
+       (free-alien ,ov))))
+
+(defmacro with-zstd-buffers ((iv ov &key src src-size src-pos dst dst-size dst-pos) &body body)
+  `(with-alien ((,iv (* zstd-inbuffer) (allocate-zstd-inbuffer))
+                (,ov (* zstd-outbuffer) (allocate-zstd-outbuffer)))
+     (unwind-protect
+          (progn
+            ,@(when src `((setf (zstd-inbuffer-src ,iv) ,src)))
+            ,@(when src-size `((setf (zstd-inbuffer-size ,iv) ,src-size)))
+            ,@(when src-pos `((setf (zstd-inbuffer-pos ,iv) ,src-pos)))
+            ,@(when dst `((setf (zstd-outbuffer-dst ,ov) ,dst)))
+            ,@(when dst-size `((setf (zstd-outbuffer-size ,ov) ,dst-size)))
+            ,@(when dst-pos `((setf (zstd-outbuffer-pos ,ov) ,dst-pos)))
+            ,@body)
+       (free-alien ,iv)
+       (free-alien ,ov))))
+
 (defmacro with-zstd-cstream ((cv &key (init t) (close t) (level (zstd-defaultclevel)) ) &body body)
   `(with-alien ((,cv (* zstd-cstream) (zstd-createcstream)))
      (unwind-protect
@@ -75,3 +110,25 @@
                     (zstd-dstream-error %dinit)))))
             ,@body)
        ,@(when close `((zstd-freedstream ,dv))))))
+
+(defmacro with-zstd-streams ((cv dv &key (init t) (close t) (level (zstd-defaultclevel))) &body body)
+  "Evaluate BODY with CV bound to the result of ZSTD-CREATECSTREAM and DV to
+the result of ZSTD-CREATEDSTREAM. INIT and CLOSE are examined at
+macroexpansion time: when INIT is non-nil both streams are initialized (CV
+at LEVEL) and a failing result is passed to that stream's own error; when
+CLOSE is non-nil both streams are freed in the UNWIND-PROTECT cleanup."
+  `(with-alien ((,cv (* zstd-cstream) (zstd-createcstream))
+                (,dv (* zstd-dstream) (zstd-createdstream)))
+     (unwind-protect
+          (progn
+            ,@(when init `((let ((%cinit (zstd-initcstream ,cv ,level))
+                                 (%dinit (zstd-initdstream ,dv)))
+                             ;; TODO 2024-09-18:
+                             ;; Test each stream's own init result and raise its own error type.
+                             (unless (zerop (zstd-iserror %cinit))
+                               (zstd-cstream-error %cinit))
+                             (unless (zerop (zstd-iserror %dinit))
+                               (zstd-dstream-error %dinit)))))
+            ,@body)
+       ,@(when close `((zstd-freecstream ,cv)
+                       (zstd-freedstream ,dv))))))
diff -r b499d4bcfc39 -r 937a6f354047 lisp/ffi/zstd/tests.lisp
--- a/lisp/ffi/zstd/tests.lisp Tue Sep 17 22:19:19 2024 -0400
+++ b/lisp/ffi/zstd/tests.lisp Wed Sep 18 21:48:06 2024 -0400
@@ -45,14 +45,14 @@
   (is (< (zstd-cstreaminsize) (zstd-cstreamoutsize)))
   (with-alien ((in (* zstd-inbuffer) (zstd::allocate-zstd-inbuffer))
                (out (* zstd-outbuffer) (zstd::allocate-zstd-outbuffer)))
-    (let* ((str "this is a test yad ayd ay aya dayd ayd ada")
-           (len (length str)))
+    (let* ((str "this is a test yad ayd ay aya dayd ayd ada"))
       (setf (zstd::zstd-inbuffer-src in) (make-alien-string str)
-            (zstd::zstd-inbuffer-size in) len)
+            (zstd::zstd-inbuffer-size in) (zstd-cstreaminsize))
       (with-zstd-cstream (cs)
         (is (zerop (zstd::zstd-initcstream cs (zstd-defaultclevel))))
         (with-zstd-dstream (ds)
           ;; (setf (zstd::zstd-outbuffer-dst out) (make-alien-string str))
+          (setf (zstd::zstd-outbuffer-size out) (zstd-cstreamoutsize))
           (zstd-compressstream cs out in)
           (zstd::zstd-flushstream cs out)
           (zstd::zstd-endstream cs out)
@@ -62,10 +62,47 @@
                     c-string)
                str)))))))
 
-(deftest cstream ()
-  "Test streaming compression based on zstd.h HowTo guide.")
-
 (deftest streaming2 ()
-  "Test the Zstd v2 Streaming API.")
-;; simple-dictionary
-;; builk-dictionary
+  "Test the Zstd v2 Streaming API."
+ (let ((test "test 1 2 3")) + (with-zstd-buffers (in out :src (make-alien-string test)) + (with-zstd-streams (cs ds) + (zstd-compressstream2 cs out in 0) + (zstd-compressstream2 cs out in 1) + (is (zerop (zstd-iserror (zstd-compressstream2 cs out in 2)))) + (zstd::zstd-flushstream cs out) + (is (zerop (zstd-iserror (zstd::zstd-endstream cs out)))) + (zstd-decompressstream ds out in) + (is (string-equal + (cast (zstd::zstd-inbuffer-src in) c-string) + test)))))) + +(deftest simple-dictionary () + (let ((test "test 1 2 3")) + (with-alien ((dict (* t)) + (dst (array (unsigned 8) 100))) + (with-zstd-buffers (in out :src (cast (make-alien-string test) (* t)) :dst (cast dst (* t)) :dst-size 100) + (is (= 100 (zstd::zstd-outbuffer-size out))) + (with-zstd-streams (cs ds) + (is + (zerop + (zstd-iserror + (zstd::zstd-compress-usingdict + cs + (zstd::zstd-outbuffer-dst out) (zstd::zstd-outbuffer-size out) + (zstd::zstd-inbuffer-src in) (zstd::zstd-inbuffer-size in) + dict (length test) (zstd-defaultclevel))))) + (is + (zerop + (zstd-iserror + (zstd::zstd-decompress-usingdict + ds + (zstd::zstd-outbuffer-dst out) (zstd::zstd-outbuffer-size out) + (zstd::zstd-inbuffer-src in) (zstd::zstd-inbuffer-size in) + dict (length test)))))))))) + +(deftest bulk-dictionary () + (with-zstd-ddict (dd :buffer #(1 2 3)) + (is (typep dd '(alien (* (struct zstd::zstd-ddict-s)))))) + (with-zstd-cdict (cd :buffer #(4 5 6)) + (is (typep cd '(alien (* (struct zstd::zstd-cdict-s))))))) diff -r b499d4bcfc39 -r 937a6f354047 lisp/std/alien.lisp --- a/lisp/std/alien.lisp Tue Sep 17 22:19:19 2024 -0400 +++ b/lisp/std/alien.lisp Wed Sep 18 21:48:06 2024 -0400 @@ -99,17 +99,26 @@ (push c-string reversed-result) (return (nreverse reversed-result))))))) -(defmacro clone-octets-to-alien (lispa aliena) - (with-gensyms (i) - `(loop for ,i from 0 below (length ,lispa) - do (setf (deref ,aliena ,i) - (aref ,lispa ,i))))) +(defun clone-octets-to-alien (lispa aliena) + (declare (optimize (speed 3))) + (loop for i 
from 0 below (length lispa) + do (setf (deref aliena i) + (aref lispa i))) + aliena) -(defmacro clone-octets-from-alien (aliena lispa len) - (with-gensyms (i) - `(loop for ,i from 0 below ,len - do (setf (aref ,lispa ,i) - (deref ,aliena ,i))))) +(defmacro octets-to-alien (lispa) + (with-gensyms (a) + `(with-alien ((,a (array (unsigned 8) ,(length lispa)))) + (clone-octets-to-alien ,lispa ,a)))) + +(defun clone-octets-from-alien (aliena lispa &optional len) + (declare (optimize (speed 3)) + (array lispa)) + (unless len (setf len (length lispa))) + (loop for i from 0 below len + do (setf (aref lispa i) + (deref aliena i))) + lispa) (defun foreign-int-to-integer (buffer size) "Check SIZE of int BUFFER. return BUFFER." diff -r b499d4bcfc39 -r 937a6f354047 lisp/std/pkg.lisp --- a/lisp/std/pkg.lisp Tue Sep 17 22:19:19 2024 -0400 +++ b/lisp/std/pkg.lisp Wed Sep 18 21:48:06 2024 -0400 @@ -156,6 +156,7 @@ :copy-c-string :clone-strings :clone-octets-to-alien + :octets-to-alien :clone-octets-from-alien :foreign-int-to-integer :foreign-int-to-bool diff -r b499d4bcfc39 -r 937a6f354047 lisp/tests.lisp --- a/lisp/tests.lisp Tue Sep 17 22:19:19 2024 -0400 +++ b/lisp/tests.lisp Wed Sep 18 21:48:06 2024 -0400 @@ -17,4 +17,4 @@ (mapcar (lambda (x) (do-tests x force)) (remove *test-suite* *test-suite-list*))) (deftest all () - (run-all-tests t)) + (do-tests *test-suite-list*