1.1--- a/lisp/lib/dat/dat.asd Sat Jul 13 00:03:13 2024 -0400
1.2+++ b/lisp/lib/dat/dat.asd Sat Jul 13 18:18:01 2024 -0400
1.3@@ -11,9 +11,12 @@
1.4 (:file "json")
1.5 (:module "parquet"
1.6 :components
1.7- ((:file "pkg")
1.8- (:file "gen")
1.9- (:file "parquet")))
1.10+ ((:file "gen") ; gen.lisp now carries the DAT/PARQUET/GEN defpackage
1.11+ (:file "pkg") ; magic number, default sizes and creator string
1.12+ (:file "obj") ; runs LOAD-PARQUET and defines the enum DEFTYPEs
1.13+ (:file "io") ; magic-number read/write helpers
1.14+ (:file "rle") ; commentary only so far
1.15+ (:file "proto"))) ; PARQUET-READ / PARQUET-WRITE protocol and encode/decode entry points
1.16 (:module "xml"
1.17 :components
1.18 ((:file "xml")
2.1--- a/lisp/lib/dat/parquet/gen.lisp Sat Jul 13 00:03:13 2024 -0400
2.2+++ b/lisp/lib/dat/parquet/gen.lisp Sat Jul 13 18:18:01 2024 -0400
2.3@@ -3,6 +3,10 @@
2.4 ;;
2.5
2.6 ;;; Code:
2.7+(defpackage :dat/parquet/gen ;; not public API
2.8+ (:use :cl :std :dat/proto :dat/json)
2.9+ (:export :load-parquet))
2.10+
2.11 (in-package :dat/parquet/gen)
2.12 (defparameter *parquet-json-file*
2.13 (probe-file #.(asdf:system-relative-pathname :prelude #P"../.stash/parquet.json")))
2.14@@ -24,7 +28,8 @@
2.15 (defmacro def-parquet-enum (sym name)
2.16 `(progn
2.17 (defun ,(symbolicate "PARQUET-JSON-" sym) ()
2.18- (mapcar (lambda (x) (json-getf x "name")) (parquet-json-enum-getf ,name)))
2.19+ (mapcar (lambda (x) (keywordicate (snakecase-name-to-lisp-name (json-getf x "name"))))
2.20+ (parquet-json-enum-getf ,name)))
2.21 (defparameter ,(intern
2.22 (concatenate 'string "*PARQUET-" (symbol-name sym) "*")
2.23 :dat/parquet)
2.24@@ -205,7 +210,6 @@
2.25 (defun load-parquet (&key (file *parquet-json-file*))
2.26 (init-parquet-json file)
2.27 (with-package (:dat/parquet)
2.28- (define-parquet-class parquet-enum-object () ())
2.29 (define-parquet-class parquet-struct-object () ())
2.30 (let ((types (define-parquet-types)))
2.31 (export types)
3.1--- /dev/null Thu Jan 01 00:00:00 1970 +0000
3.2+++ b/lisp/lib/dat/parquet/io.lisp Sat Jul 13 18:18:01 2024 -0400
3.3@@ -0,0 +1,18 @@
3.4+;;; io.lisp --- Parquet IO
3.5+
3.6+;;
3.7+
3.8+;;; Code:
3.9+(in-package :dat/parquet)
3.10+
3.11+;;; Read/Write
3.12+(defun parquet-write-magic (stream) ; writes +PARQUET-MAGIC-NUMBER+ ("PAR1") to STREAM and returns the string
3.13+ (write-string +parquet-magic-number+ stream)) ; NOTE(review): WRITE-STRING needs a character stream -- confirm callers never pass an octet stream
3.14+
3.15+(defun parquet-read-magic (stream) ; reads up to four characters; returns T when all match the magic, otherwise ASSERT signals
3.16+ (assert (char= #.(char +parquet-magic-number+ 0) (read-char stream))) ; #. folds the expected character in at read time,
3.17+ (assert (char= #.(char +parquet-magic-number+ 1) (read-char stream))) ; so +PARQUET-MAGIC-NUMBER+ must already be defined
3.18+ (assert (char= #.(char +parquet-magic-number+ 2) (read-char stream))) ; when this file is read
3.19+ (assert (char= #.(char +parquet-magic-number+ 3) (read-char stream)))
3.20+ t)
3.21+
4.1--- /dev/null Thu Jan 01 00:00:00 1970 +0000
4.2+++ b/lisp/lib/dat/parquet/obj.lisp Sat Jul 13 18:18:01 2024 -0400
4.3@@ -0,0 +1,23 @@
4.4+;;; obj.lisp --- Parquet Objects
4.5+
4.6+;; Parquet class and type definitions generated from parquet.json.
4.7+
4.8+;;; Code:
4.9+(in-package :dat/parquet)
4.10+
4.11+(eval-always ; presumably an EVAL-WHEN wrapper covering compile, load and eval -- TODO confirm
4.12+ (dat/parquet/gen::load-parquet)) ; runs the generator over parquet.json; presumably also what binds the *PARQUET-...* enum lists -- TODO confirm
4.13+
4.14+(deftype parquet-compression-codec () `(member ,@*parquet-compression-codecs*)) ; ,@ splices the list so each element is its own MEMBER argument
4.15+
4.16+(deftype parquet-boundary-order () `(member ,@*parquet-boundary-orders*))
4.17+
4.18+(deftype parquet-encoding () `(member ,@*parquet-encodings*))
4.19+
4.20+(deftype parquet-field-repetition () `(member ,@*parquet-field-repetition-types*))
4.21+
4.22+(deftype parquet-type-designator () `(member ,@*parquet-types*))
4.23+
4.24+(deftype parquet-converted-type-designator () `(member ,@*parquet-converted-types*))
4.25+
4.26+(deftype parquet-page-type () `(member ,@*parquet-page-types*)) ; all seven assume the variable holds a proper list -- TODO confirm against DEF-PARQUET-ENUM
5.1--- a/lisp/lib/dat/parquet/parquet.lisp Sat Jul 13 00:03:13 2024 -0400
5.2+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
5.3@@ -1,102 +0,0 @@
5.4-;;; parquet.lisp --- Apache Parquet
5.5-
5.6-;; Common Lisp implementation of Apache Parquet
5.7-
5.8-;;; Commentary:
5.9-
5.10-#|
5.11-https://github.com/apache/parquet-format
5.12-https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
5.13-https://github.com/apache/parquet-testing
5.14-https://github.com/apache/parquet-java
5.15-https://github.com/apache/arrow-rs
5.16-
5.17-https://thrift.apache.org/docs/types
5.18-|#
5.19-
5.20-#|
5.21- 4-byte magic number "PAR1"
5.22- <Column 1 Chunk 1>
5.23- <Column 2 Chunk 1>
5.24- ...
5.25- <Column N Chunk 1>
5.26- <Column 1 Chunk 2>
5.27- <Column 2 Chunk 2>
5.28- ...
5.29- <Column N Chunk 2>
5.30- ...
5.31- <Column 1 Chunk M>
5.32- <Column 2 Chunk M>
5.33- ...
5.34- <Column N Chunk M>
5.35- File Metadata
5.36- 4-byte length in bytes of file metadata (little endian)
5.37- 4-byte magic number "PAR1"
5.38-|#
5.39-
5.40-;; In this package we're being as lazy as possible. To generate our own
5.41-;; encoder/decoder methods we depend on the file parquet.thrift in the
5.42-;; parquet-format repo above. The core skelfile includes a script to download
5.43-;; it and convert it to parquet.json (requires the thirft cli tool). We then
5.44-;; decode it with DAT/JSON and generate lisp classes, and types.
5.45-
5.46-;;
5.47-;;; Code:
5.48-(in-package :dat/parquet)
5.49-(eval-always
5.50- (dat/parquet/gen::load-parquet))
5.51-
5.52-(defgeneric parquet-read (value &optional stream))
5.53-(defgeneric parquet-write (value &optional stream))
5.54-
5.55-;; HACK 2024-07-12:
5.56-(define-bitfield parquet-compression-codec
5.57- (uncompressed boolean)
5.58- (snappy boolean)
5.59- (gzip boolean)
5.60- (lzo boolean)
5.61- (brotli boolean)
5.62- (lz4 boolean)
5.63- (zstd boolean)
5.64- (lz4-raw boolean))
5.65-
5.66-;;; Read/Write
5.67-(define-constant +parquet-magic-number+ "PAR1" :test 'equal)
5.68-
5.69-(defconstant +default-parquet-page-size+ (* 8 1024)) ;; 8kb
5.70-(defconstant +default-parquet-row-group-size (expt 1024 3)) ;; 1gb
5.71-
5.72-(defvar *parquet-creator* "parquet-cl version 0.1.0")
5.73-
5.74-(defun parquet-write-magic (stream)
5.75- (write-string +parquet-magic-number+ stream))
5.76-
5.77-(defun parquet-read-magic (stream)
5.78- (assert (char= #.(char +parquet-magic-number+ 0) (read-char stream)))
5.79- (assert (char= #.(char +parquet-magic-number+ 1) (read-char stream)))
5.80- (assert (char= #.(char +parquet-magic-number+ 2) (read-char stream)))
5.81- (assert (char= #.(char +parquet-magic-number+ 3) (read-char stream)))
5.82- t)
5.83-
5.84-(defmethod parquet-write ((value (eql t)) &optional stream)
5.85- "Encode a parquet boolean true value."
5.86- (declare (ignore value))
5.87- (write-byte 1 stream))
5.88-
5.89-(defmethod parquet-write ((value (eql nil)) &optional stream)
5.90- "Encode a parquet boolean false value."
5.91- (declare (ignore value))
5.92- (write-byte 0 stream))
5.93-
5.94-(defmethod parquet-write ((value string) &optional stream))
5.95-
5.96-;;; Encode/Decode
5.97-(defun parquet-encode (value &optional stream)
5.98- "Encode a Lisp value and write it to a parquet stream."
5.99- (parquet-write value stream))
5.100-
5.101-(defun parquet-decode (string &key (start 0) end)
5.102- "Convert a PARQUET string into a Lisp object."
5.103- (with-input-from-string (stream string :start start :end end)
5.104- (values (parquet-read stream)
5.105- (file-position stream))))
6.1--- a/lisp/lib/dat/parquet/pkg.lisp Sat Jul 13 00:03:13 2024 -0400
6.2+++ b/lisp/lib/dat/parquet/pkg.lisp Sat Jul 13 18:18:01 2024 -0400
6.3@@ -1,10 +1,57 @@
6.4-;;; pkg.lisp --- Parquet Packages
6.5+;;; pkg.lisp --- Apache Parquet Packages
6.6+
6.7+;; Common Lisp Parquet Implementation
6.8+
6.9+;;; Commentary:
6.10+
6.11+#|
6.12+https://github.com/apache/parquet-format
6.13+https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
6.14+https://github.com/apache/parquet-testing
6.15+https://github.com/apache/parquet-java
6.16+https://github.com/apache/arrow-rs
6.17+https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/36632.pdf
6.18+https://thrift.apache.org/docs/types
6.19+|#
6.20
6.21-;;
6.22+#|
6.23+ 4-byte magic number "PAR1"
6.24+ <Column 1 Chunk 1>
6.25+ <Column 2 Chunk 1>
6.26+ ...
6.27+ <Column N Chunk 1>
6.28+ <Column 1 Chunk 2>
6.29+ <Column 2 Chunk 2>
6.30+ ...
6.31+ <Column N Chunk 2>
6.32+ ...
6.33+ <Column 1 Chunk M>
6.34+ <Column 2 Chunk M>
6.35+ ...
6.36+ <Column N Chunk M>
6.37+ File Metadata
6.38+ 4-byte length in bytes of file metadata (little endian)
6.39+ 4-byte magic number "PAR1"
6.40+|#
6.41+
6.42+;; In this file we're being as lazy as possible. To generate our base objects
6.43+;; we depend on the file parquet.thrift in the parquet-format repo. The core
6.44+;; skelfile includes a script to download it and convert it to parquet.json
6.45+;; (requires the thrift CLI tool). We then decode it with DAT/JSON and
6.46+;; generate Lisp classes and types.
6.47+
6.48+;; NOTE: there is actually a Common Lisp code generator for Thrift. It seems to
6.49+;; work but it requires an ASDF system named thrift which I couldn't find
6.50+;; anywhere. Granted I didn't look that hard, but I don't think it matters
6.51+;; because we ultimately don't want to depend on the Thrift CLI tool for
6.52+;; codegen.
6.53
6.54 ;;; Code:
6.55 (in-package :dat/parquet)
6.56
6.57-(defpackage :dat/parquet/gen
6.58- (:use :cl :std :dat/proto :dat/json)
6.59- (:export :load-parquet))
6.60+(define-constant +parquet-magic-number+ "PAR1" :test 'equal) ;; 4-character marker at the start and end of a file (see the layout comment above)
6.61+
6.62+(defconstant +default-parquet-page-size+ (* 8 1024)) ;; 8192, i.e. 8 KiB
6.63+(defconstant +default-parquet-row-group-size (expt 1024 3)) ;; 1 GiB; NOTE(review): name lacks the trailing `+' the other constants use
6.64+
6.65+(defvar *parquet-creator* "dat/parquet version 0.1.0") ;; presumably the writer identification string -- TODO confirm where it is emitted
7.1--- /dev/null Thu Jan 01 00:00:00 1970 +0000
7.2+++ b/lisp/lib/dat/parquet/proto.lisp Sat Jul 13 18:18:01 2024 -0400
7.3@@ -0,0 +1,32 @@
7.4+;;; proto.lisp --- Parquet Data Protocol
7.5+
7.6+;;
7.7+
7.8+;;; Code:
7.9+(in-package :dat/parquet)
7.10+
7.11+(defgeneric parquet-read (value &optional stream)) ; reader half of the protocol; no methods are defined in this file
7.12+(defgeneric parquet-write (value &optional stream)) ; writer half; the methods in this file specialize on VALUE
7.13+
7.14+(defmethod parquet-write ((value (eql t)) &optional stream)
7.15+ "Write the boolean true value to STREAM as the single octet 1."
7.16+ (declare (ignore value))
7.17+ (write-byte 1 stream)) ; NOTE(review): STREAM defaults to NIL, which WRITE-BYTE does not accept -- callers must pass a binary stream
7.18+
7.19+(defmethod parquet-write ((value (eql nil)) &optional stream)
7.20+ "Write the boolean false value to STREAM as the single octet 0."
7.21+ (declare (ignore value))
7.22+ (write-byte 0 stream)) ; same NIL-default caveat as the true method
7.23+
7.24+(defmethod parquet-write ((value string) &optional stream)) ; placeholder: empty body, writes nothing and returns NIL
7.25+
7.26+;;; Encode/Decode
7.27+(defun parquet-encode (value &optional stream)
7.28+ "Write VALUE to STREAM by dispatching to PARQUET-WRITE; returns whatever the selected method returns."
7.29+ (parquet-write value stream))
7.30+
7.31+(defun parquet-decode (string &key (start 0) end)
7.32+ "Read one object from STRING between START and END; return it and the stream position reached."
7.33+ (with-input-from-string (stream string :start start :end end)
7.34+ (values (parquet-read stream) ; NOTE(review): the stream is passed as PARQUET-READ's VALUE argument, not STREAM -- confirm intended
7.35+ (file-position stream))))
8.1--- /dev/null Thu Jan 01 00:00:00 1970 +0000
8.2+++ b/lisp/lib/dat/parquet/rle.lisp Sat Jul 13 18:18:01 2024 -0400
8.3@@ -0,0 +1,31 @@
8.4+;;; rle.lisp --- Parquet Run Length Encoding
8.5+
8.6+;;
8.7+
8.8+;;; Commentary:
8.9+
8.10+#|
8.11+rle-bit-packed-hybrid: <length> <encoded-data>
8.12+// length is not always prepended, please check the table below for more detail
8.13+length := length of the <encoded-data> in bytes stored as 4 bytes little endian (unsigned int32)
8.14+encoded-data := <run>*
8.15+run := <bit-packed-run> | <rle-run>
8.16+bit-packed-run := <bit-packed-header> <bit-packed-values>
8.17+bit-packed-header := varint-encode(<bit-pack-scaled-run-len> << 1 | 1)
8.18+// we always bit-pack a multiple of 8 values at a time, so we only store the number of values / 8
8.19+bit-pack-scaled-run-len := (bit-packed-run-len) / 8
8.20+bit-packed-run-len := *see 3 below*
8.21+bit-packed-values := *see 1 below*
8.22+rle-run := <rle-header> <repeated-value>
8.23+rle-header := varint-encode( (rle-run-len) << 1)
8.24+rle-run-len := *see 3 below*
8.25+repeated-value := value that is repeated, using a fixed-width of round-up-to-next-byte(bit-width)
8.26+|#
8.27+
8.28+;; RLE is only supported for the following data types:
8.29+;; - Repetition and definition levels
8.30+;; - Dictionary indices
8.31+;; - Boolean values in data pages, as an alternative to PLAIN encoding
8.32+
8.33+;;; Code:
8.34+(in-package :dat/parquet)
9.1--- a/lisp/std/bit.lisp Sat Jul 13 00:03:13 2024 -0400
9.2+++ b/lisp/std/bit.lisp Sat Jul 13 18:18:01 2024 -0400
9.3@@ -498,22 +498,13 @@
9.4 (aref hexdigits (ldb (byte 4 0) byte))))
9.5 finally (return string))))
9.6
9.7-(defun octets-to-integer (octet-vec &optional (end (length octet-vec)))
9.8+(defun octets-to-integer (octet-vec &optional (bytes (length octet-vec)))
9.9 (declare (type (simple-array (unsigned-byte 8)) octet-vec))
9.10 (do ((j 0 (1+ j))
9.11 (sum 0))
9.12- ((>= j end) sum)
9.13+ ((>= j bytes) sum)
9.14 (setf sum (+ (aref octet-vec j) (ash sum 8)))))
9.15
9.16-(defun read-little-endian (s &optional (bytes 4))
9.17- "Read a number in little-endian format from an byte (octet) stream S,
9.18-the number having BYTES octets (defaulting to 4)."
9.19- (loop for i from 0 below bytes
9.20- sum (ash (read-byte s) (* 8 i))))
9.21-
9.22-(defun write-little-endian (i s &optional (bytes 4))
9.23- (write-sequence (nreverse (integer-to-octets i (* 8 bytes))) s))
9.24-
9.25 (defun integer-to-octets (bignum &optional (n-bits (integer-length bignum)))
9.26 (let* ((n-bytes (ceiling n-bits 8))
9.27 (octet-vec (make-array n-bytes :element-type '(unsigned-byte 8))))
9.28@@ -522,3 +513,25 @@
9.29 for index from 0
9.30 do (setf (aref octet-vec index) (ldb (byte 8 (* i 8)) bignum))
9.31 finally (return octet-vec))))
9.32+
9.33+(defun octets-to-integer-le (octet-vec &optional (bytes (length octet-vec))) ; first BYTES octets, least significant first
9.34+  (declare (type (simple-array (unsigned-byte 8)) octet-vec))
9.35+  (do ((k 0 (1+ k)) (acc 0 (+ acc (ash (aref octet-vec k) (* 8 k))))) ; octet K is weighted by 2^(8K)
9.36+      ((>= k bytes) acc)))
9.37+
9.38+(defun integer-to-octets-le (bignum &optional (n-bits (integer-length bignum)))
9.39+  "Return a fresh vector of (CEILING N-BITS 8) octets of BIGNUM, least significant octet first."
9.40+  (let* ((n-octets (ceiling n-bits 8)) ; a partial trailing octet still gets its own slot
9.41+         (out (make-array n-octets :element-type '(unsigned-byte 8))))
9.42+    (declare (type (simple-array (unsigned-byte 8)) out))
9.43+    (dotimes (k n-octets out)
9.44+      (setf (aref out k) (ldb (byte 8 (* 8 k)) bignum)))))
9.45+
9.46+(defun read-little-endian (s &optional (bytes 4))
9.47+  "Read BYTES octets (default 4) from the byte (octet) stream S and return
9.48+the integer they form, least significant octet first."
9.49+  (do ((k 0 (1+ k)) (acc 0 (+ acc (ash (read-byte s) (* 8 k))))) ; one READ-BYTE per step
9.50+      ((>= k bytes) acc)))
9.51+
9.52+(defun write-little-endian (i s &optional (bytes 4)) ; writes I to octet stream S as exactly BYTES little-endian octets
9.53+  (write-sequence (integer-to-octets-le i (* 8 bytes)) s)) ; INTEGER-TO-OCTETS-LE takes a bit count, so scale BYTES by 8
10.1--- a/lisp/std/num/float.lisp Sat Jul 13 00:03:13 2024 -0400
10.2+++ b/lisp/std/num/float.lisp Sat Jul 13 18:18:01 2024 -0400
10.3@@ -1,6 +1,16 @@
10.4 ;;; std/num/float.lisp --- Floating Point Numbers
10.5
10.6-;;
10.7+;; IEEE 754 Floating Point encoding and decoding.
10.8+
10.9+;;; Commentary:
10.10+
10.11+;; This package provides default encoders for float32 and float64 as defined
10.12+;; by IEEE 754.
10.13+
10.14+;; Note that the physical encoding is always represented as a fixnum.
10.15+
10.16+;; To read/write from a file you must convert between the fixnum repr and bytes,
10.17+;; usually with octets-to-integer / integer-to-octets (or their -le variants).
10.18
10.19 ;;; Code:
10.20
10.21@@ -11,6 +21,7 @@
10.22
10.23 (in-package :std/num)
10.24 (declaim (optimize (speed 3)))
10.25+
10.26 ;; The following macro may look a bit overcomplicated to the casual
10.27 ;; reader. The main culprit is the fact that NaN and infinity can be
10.28 ;; optionally included, which adds a bunch of conditional parts.
11.1--- /dev/null Thu Jan 01 00:00:00 1970 +0000
11.2+++ b/lisp/std/num/leb128.lisp Sat Jul 13 18:18:01 2024 -0400
11.3@@ -0,0 +1,12 @@
11.4+;;; leb128.lisp --- Little-Endian Base 128 Variable Encoding
11.5+
11.6+;; (U)LEB128 encoders
11.7+
11.8+;;; Commentary:
11.9+
11.10+;; ref: https://en.wikipedia.org/wiki/LEB128
11.11+;; opt: https://arxiv.org/abs/1503.07387 VByte
11.12+;; opt: https://arxiv.org/pdf/1709.08990 VByte streaming
11.13+
11.14+;;; Code:
11.15+(in-package :std/num)
12.1--- a/lisp/std/pkg.lisp Sat Jul 13 00:03:13 2024 -0400
12.2+++ b/lisp/std/pkg.lisp Sat Jul 13 18:18:01 2024 -0400
12.3@@ -351,6 +351,10 @@
12.4 :octet-vector-to-hex-string
12.5 :octets-to-integer
12.6 :integer-to-octets
12.7+ :octets-to-integer-le
12.8+ :integer-to-octets-le
12.9+ :read-little-endian
12.10+ :write-little-endian
12.11 :hexchar-to-int))
12.12
12.13 (defpkg :std/fmt
13.1--- a/lisp/std/std.asd Sat Jul 13 00:03:13 2024 -0400
13.2+++ b/lisp/std/std.asd Sat Jul 13 18:18:01 2024 -0400
13.3@@ -28,7 +28,8 @@
13.4 (:module "num"
13.5 :components
13.6 ((:file "float")
13.7- (:file "parse")))
13.8+ (:file "parse")
13.9+ (:file "leb128")))
13.10 (:file "stream")
13.11 (:module "fu"
13.12 :components
14.1--- a/lisp/std/task.lisp Sat Jul 13 00:03:13 2024 -0400
14.2+++ b/lisp/std/task.lisp Sat Jul 13 18:18:01 2024 -0400
14.3@@ -69,7 +69,7 @@
14.4 (defvar *default-worker-name* "worker")
14.5
14.6 (defclass worker ()
14.7- ((thread :initform (%make-thread #.#1=(symbol-name (gensym "w")) t (make-semaphore :name #.#1#))
14.8+ ((thread :initform (sb-thread::%make-thread #.#1=(symbol-name (gensym "w")) t (make-semaphore :name #.#1#))
14.9 :accessor worker-thread
14.10 :initarg :thread)
14.11 (function :type function :accessor worker-function :initarg :function)