changelog shortlog graph tags branches files raw help

Mercurial > core / changeset: parquet expansion, init leb128, add little-endian octet encoders

changeset 544: ec1d4d544c36
parent 543: b88bd4b0a039
child 545: 312eb5995ed4
author: Richard Westhaver <ellis@rwest.io>
date: Sat, 13 Jul 2024 18:18:01 -0400
files: lisp/lib/dat/dat.asd lisp/lib/dat/parquet/gen.lisp lisp/lib/dat/parquet/io.lisp lisp/lib/dat/parquet/obj.lisp lisp/lib/dat/parquet/parquet.lisp lisp/lib/dat/parquet/pkg.lisp lisp/lib/dat/parquet/proto.lisp lisp/lib/dat/parquet/rle.lisp lisp/std/bit.lisp lisp/std/num/float.lisp lisp/std/num/leb128.lisp lisp/std/pkg.lisp lisp/std/std.asd lisp/std/task.lisp
description: parquet expansion, init leb128, add little-endian octet encoders
     1.1--- a/lisp/lib/dat/dat.asd	Sat Jul 13 00:03:13 2024 -0400
     1.2+++ b/lisp/lib/dat/dat.asd	Sat Jul 13 18:18:01 2024 -0400
     1.3@@ -11,9 +11,12 @@
     1.4                (:file "json")
     1.5                (:module "parquet"
     1.6                 :components
     1.7-                ((:file "pkg")
     1.8-                 (:file "gen")
     1.9-                 (:file "parquet")))
    1.10+                ((:file "gen")
    1.11+                 (:file "pkg")
    1.12+                 (:file "obj")
    1.13+                 (:file "io")
    1.14+                 (:file "rle")
    1.15+                 (:file "proto")))
    1.16                (:module "xml"
    1.17                 :components
    1.18                 ((:file "xml")
     2.1--- a/lisp/lib/dat/parquet/gen.lisp	Sat Jul 13 00:03:13 2024 -0400
     2.2+++ b/lisp/lib/dat/parquet/gen.lisp	Sat Jul 13 18:18:01 2024 -0400
     2.3@@ -3,6 +3,10 @@
     2.4 ;; 
     2.5 
     2.6 ;;; Code:
     2.7+(defpackage :dat/parquet/gen ;; not public API
     2.8+  (:use :cl :std :dat/proto :dat/json)
     2.9+  (:export :load-parquet))
    2.10+
    2.11 (in-package :dat/parquet/gen)
    2.12 (defparameter *parquet-json-file*
    2.13   (probe-file #.(asdf:system-relative-pathname :prelude #P"../.stash/parquet.json")))
    2.14@@ -24,7 +28,8 @@
    2.15 (defmacro def-parquet-enum (sym name)
    2.16   `(progn
    2.17      (defun ,(symbolicate "PARQUET-JSON-" sym) ()
    2.18-       (mapcar (lambda (x) (json-getf x "name")) (parquet-json-enum-getf ,name)))
    2.19+       (mapcar (lambda (x) (keywordicate (snakecase-name-to-lisp-name (json-getf x "name"))))
    2.20+               (parquet-json-enum-getf ,name)))
    2.21      (defparameter ,(intern
    2.22                      (concatenate 'string "*PARQUET-" (symbol-name sym) "*")
    2.23                      :dat/parquet)
    2.24@@ -205,7 +210,6 @@
    2.25 (defun load-parquet (&key (file *parquet-json-file*))
    2.26   (init-parquet-json file)
    2.27   (with-package (:dat/parquet)
    2.28-    (define-parquet-class parquet-enum-object () ())
    2.29     (define-parquet-class parquet-struct-object () ())
    2.30     (let ((types (define-parquet-types)))
    2.31       (export types)
     3.1--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.2+++ b/lisp/lib/dat/parquet/io.lisp	Sat Jul 13 18:18:01 2024 -0400
     3.3@@ -0,0 +1,18 @@
     3.4+;;; io.lisp --- Parquet IO
     3.5+
     3.6+;; 
     3.7+
     3.8+;;; Code:
     3.9+(in-package :dat/parquet)
    3.10+
    3.11+;;; Read/Write
    3.12+(defun parquet-write-magic (stream)
    3.13+  (write-string +parquet-magic-number+ stream))
    3.14+
    3.15+(defun parquet-read-magic (stream)
    3.16+  (assert (char= #.(char +parquet-magic-number+ 0) (read-char stream)))
    3.17+  (assert (char= #.(char +parquet-magic-number+ 1) (read-char stream)))
    3.18+  (assert (char= #.(char +parquet-magic-number+ 2) (read-char stream)))
    3.19+  (assert (char= #.(char +parquet-magic-number+ 3) (read-char stream)))
    3.20+  t)
    3.21+
     4.1--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     4.2+++ b/lisp/lib/dat/parquet/obj.lisp	Sat Jul 13 18:18:01 2024 -0400
     4.3@@ -0,0 +1,23 @@
     4.4+;;; obj.lisp --- Parquet Objects
     4.5+
     4.6+;; Parquet class and type definitions generated from parquet.json.
     4.7+
     4.8+;;; Code:
     4.9+(in-package :dat/parquet)
    4.10+
    4.11+(eval-always
    4.12+  (dat/parquet/gen::load-parquet))
    4.13+
    4.14+(deftype parquet-compression-codec () `(member ,@*parquet-compression-codecs*)) ; ,@ splices each keyword as its own member
    4.15+
    4.16+(deftype parquet-boundary-order () `(member ,@*parquet-boundary-orders*))
    4.17+
    4.18+(deftype parquet-encoding () `(member ,@*parquet-encodings*))
    4.19+
    4.20+(deftype parquet-field-repetition () `(member ,@*parquet-field-repetition-types*))
    4.21+
    4.22+(deftype parquet-type-designator () `(member ,@*parquet-types*))
    4.23+
    4.24+(deftype parquet-converted-type-designator () `(member ,@*parquet-converted-types*))
    4.25+
    4.26+(deftype parquet-page-type () `(member ,@*parquet-page-types*))
     5.1--- a/lisp/lib/dat/parquet/parquet.lisp	Sat Jul 13 00:03:13 2024 -0400
     5.2+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     5.3@@ -1,102 +0,0 @@
     5.4-;;; parquet.lisp --- Apache Parquet
     5.5-
     5.6-;; Common Lisp implementation of Apache Parquet
     5.7-
     5.8-;;; Commentary:
     5.9-
    5.10-#|
    5.11-https://github.com/apache/parquet-format
    5.12-https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
    5.13-https://github.com/apache/parquet-testing
    5.14-https://github.com/apache/parquet-java
    5.15-https://github.com/apache/arrow-rs
    5.16-
    5.17-https://thrift.apache.org/docs/types
    5.18-|#
    5.19-
    5.20-#|
    5.21-    4-byte magic number "PAR1"
    5.22-    <Column 1 Chunk 1>
    5.23-    <Column 2 Chunk 1>
    5.24-    ...
    5.25-    <Column N Chunk 1>
    5.26-    <Column 1 Chunk 2>
    5.27-    <Column 2 Chunk 2>
    5.28-    ...
    5.29-    <Column N Chunk 2>
    5.30-    ...
    5.31-    <Column 1 Chunk M>
    5.32-    <Column 2 Chunk M>
    5.33-    ...
    5.34-    <Column N Chunk M>
    5.35-    File Metadata
    5.36-    4-byte length in bytes of file metadata (little endian)
    5.37-    4-byte magic number "PAR1"
    5.38-|#
    5.39-
    5.40-;; In this package we're being as lazy as possible. To generate our own
    5.41-;; encoder/decoder methods we depend on the file parquet.thrift in the
    5.42-;; parquet-format repo above. The core skelfile includes a script to download
    5.43-;; it and convert it to parquet.json (requires the thirft cli tool). We then
    5.44-;; decode it with DAT/JSON and generate lisp classes, and types.
    5.45-
    5.46-;; 
    5.47-;;; Code:
    5.48-(in-package :dat/parquet)
    5.49-(eval-always
    5.50-  (dat/parquet/gen::load-parquet))
    5.51-
    5.52-(defgeneric parquet-read (value &optional stream))
    5.53-(defgeneric parquet-write (value &optional stream))
    5.54-
    5.55-;;  HACK 2024-07-12: 
    5.56-(define-bitfield parquet-compression-codec
    5.57-  (uncompressed boolean)
    5.58-  (snappy boolean)
    5.59-  (gzip boolean)
    5.60-  (lzo boolean)
    5.61-  (brotli boolean)
    5.62-  (lz4 boolean)
    5.63-  (zstd boolean)
    5.64-  (lz4-raw boolean))
    5.65-
    5.66-;;; Read/Write
    5.67-(define-constant +parquet-magic-number+ "PAR1" :test 'equal)
    5.68-
    5.69-(defconstant +default-parquet-page-size+ (* 8 1024)) ;; 8kb
    5.70-(defconstant +default-parquet-row-group-size (expt 1024 3)) ;; 1gb
    5.71-
    5.72-(defvar *parquet-creator* "parquet-cl version 0.1.0")
    5.73-
    5.74-(defun parquet-write-magic (stream)
    5.75-  (write-string +parquet-magic-number+ stream))
    5.76-
    5.77-(defun parquet-read-magic (stream)
    5.78-  (assert (char= #.(char +parquet-magic-number+ 0) (read-char stream)))
    5.79-  (assert (char= #.(char +parquet-magic-number+ 1) (read-char stream)))
    5.80-  (assert (char= #.(char +parquet-magic-number+ 2) (read-char stream)))
    5.81-  (assert (char= #.(char +parquet-magic-number+ 3) (read-char stream)))
    5.82-  t)
    5.83-
    5.84-(defmethod parquet-write ((value (eql t)) &optional stream)
    5.85-  "Encode a parquet boolean true value."
    5.86-  (declare (ignore value))
    5.87-  (write-byte 1 stream))
    5.88-
    5.89-(defmethod parquet-write ((value (eql nil)) &optional stream)
    5.90-  "Encode a parquet boolean false value."
    5.91-  (declare (ignore value))
    5.92-  (write-byte 0 stream))
    5.93-
    5.94-(defmethod parquet-write ((value string) &optional stream))
    5.95-
    5.96-;;; Encode/Decode
    5.97-(defun parquet-encode (value &optional stream)
    5.98-  "Encode a Lisp value and write it to a parquet stream."
    5.99-  (parquet-write value stream))
   5.100-
   5.101-(defun parquet-decode (string &key (start 0) end)
   5.102-  "Convert a PARQUET string into a Lisp object."
   5.103-  (with-input-from-string (stream string :start start :end end)
   5.104-    (values (parquet-read stream)
   5.105-            (file-position stream))))
     6.1--- a/lisp/lib/dat/parquet/pkg.lisp	Sat Jul 13 00:03:13 2024 -0400
     6.2+++ b/lisp/lib/dat/parquet/pkg.lisp	Sat Jul 13 18:18:01 2024 -0400
     6.3@@ -1,10 +1,57 @@
     6.4-;;; pkg.lisp --- Parquet Packages
     6.5+;;; pkg.lisp --- Apache Parquet Packages
     6.6+
     6.7+;; Common Lisp Parquet Implementation
     6.8+
     6.9+;;; Commentary:
    6.10+
    6.11+#|
    6.12+https://github.com/apache/parquet-format
    6.13+https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
    6.14+https://github.com/apache/parquet-testing
    6.15+https://github.com/apache/parquet-java
    6.16+https://github.com/apache/arrow-rs
    6.17+https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/36632.pdf
    6.18+https://thrift.apache.org/docs/types
    6.19+|#
    6.20 
    6.21-;; 
    6.22+#|
    6.23+    4-byte magic number "PAR1"
    6.24+    <Column 1 Chunk 1>
    6.25+    <Column 2 Chunk 1>
    6.26+    ...
    6.27+    <Column N Chunk 1>
    6.28+    <Column 1 Chunk 2>
    6.29+    <Column 2 Chunk 2>
    6.30+    ...
    6.31+    <Column N Chunk 2>
    6.32+    ...
    6.33+    <Column 1 Chunk M>
    6.34+    <Column 2 Chunk M>
    6.35+    ...
    6.36+    <Column N Chunk M>
    6.37+    File Metadata
    6.38+    4-byte length in bytes of file metadata (little endian)
    6.39+    4-byte magic number "PAR1"
    6.40+|#
    6.41+
    6.42+;; In this file we're being as lazy as possible. To generate our base objects
    6.43+;; we depend on the file parquet.thrift in the parquet-format repo. The core
    6.44+;; skelfile includes a script to download it and convert it to parquet.json
    6.45+;; (requires the thrift cli tool). We then decode it with DAT/JSON and
    6.46+;; generate lisp classes, and types.
    6.47+
    6.48+;; NOTE: there is actually a Common Lisp code generator for Thrift. It seems to
    6.49+;; work but it requires an ASDF system named thrift which I couldn't find
    6.50+;; anywhere. Granted I didn't look that hard, but I don't think it matters
    6.51+;; because we ultimately don't want to depend on the Thrift CLI tool for
    6.52+;; codegen.
    6.53 
    6.54 ;;; Code:
    6.55 (in-package :dat/parquet)
    6.56 
    6.57-(defpackage :dat/parquet/gen
    6.58-  (:use :cl :std :dat/proto :dat/json)
    6.59-  (:export :load-parquet))
    6.60+(define-constant +parquet-magic-number+ "PAR1" :test 'equal)
    6.61+
    6.62+(defconstant +default-parquet-page-size+ (* 8 1024)) ;; 8kb
    6.63+(defconstant +default-parquet-row-group-size (expt 1024 3)) ;; 1gb
    6.64+
    6.65+(defvar *parquet-creator* "dat/parquet version 0.1.0")
     7.1--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     7.2+++ b/lisp/lib/dat/parquet/proto.lisp	Sat Jul 13 18:18:01 2024 -0400
     7.3@@ -0,0 +1,32 @@
     7.4+;;; proto.lisp --- Parquet Data Protocol
     7.5+
     7.6+;; 
     7.7+
     7.8+;;; Code:
     7.9+(in-package :dat/parquet)
    7.10+
    7.11+(defgeneric parquet-read (value &optional stream))
    7.12+(defgeneric parquet-write (value &optional stream))
    7.13+
    7.14+(defmethod parquet-write ((value (eql t)) &optional stream)
    7.15+  "Encode a parquet boolean true value."
    7.16+  (declare (ignore value))
    7.17+  (write-byte 1 stream))
    7.18+
    7.19+(defmethod parquet-write ((value (eql nil)) &optional stream)
    7.20+  "Encode a parquet boolean false value."
    7.21+  (declare (ignore value))
    7.22+  (write-byte 0 stream))
    7.23+
    7.24+(defmethod parquet-write ((value string) &optional stream))
    7.25+
    7.26+;;; Encode/Decode
    7.27+(defun parquet-encode (value &optional stream)
    7.28+  "Encode a Lisp value and write it to a parquet stream."
    7.29+  (parquet-write value stream))
    7.30+
    7.31+(defun parquet-decode (string &key (start 0) end)
    7.32+  "Convert a PARQUET string into a Lisp object."
    7.33+  (with-input-from-string (stream string :start start :end end)
    7.34+    (values (parquet-read stream)
    7.35+            (file-position stream))))
     8.1--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     8.2+++ b/lisp/lib/dat/parquet/rle.lisp	Sat Jul 13 18:18:01 2024 -0400
     8.3@@ -0,0 +1,31 @@
     8.4+;;; rle.lisp --- Parquet Run Length Encoding
     8.5+
     8.6+;; 
     8.7+
     8.8+;;; Commentary:
     8.9+
    8.10+#|
    8.11+rle-bit-packed-hybrid: <length> <encoded-data>
    8.12+// length is not always prepended, please check the table below for more detail
    8.13+length := length of the <encoded-data> in bytes stored as 4 bytes little endian (unsigned int32)
    8.14+encoded-data := <run>*
    8.15+run := <bit-packed-run> | <rle-run>
    8.16+bit-packed-run := <bit-packed-header> <bit-packed-values>
    8.17+bit-packed-header := varint-encode(<bit-pack-scaled-run-len> << 1 | 1)
    8.18+// we always bit-pack a multiple of 8 values at a time, so we only store the number of values / 8
    8.19+bit-pack-scaled-run-len := (bit-packed-run-len) / 8
    8.20+bit-packed-run-len := *see 3 below*
    8.21+bit-packed-values := *see 1 below*
    8.22+rle-run := <rle-header> <repeated-value>
    8.23+rle-header := varint-encode( (rle-run-len) << 1)
    8.24+rle-run-len := *see 3 below*
    8.25+repeated-value := value that is repeated, using a fixed-width of round-up-to-next-byte(bit-width)
    8.26+|#
    8.27+
    8.28+;; RLE is only supported for the following data types:
    8.29+;; - Repetition and definition levels
    8.30+;; - Dictionary indices
    8.31+;; - Boolean values in data pages, as an alternative to PLAIN encoding
    8.32+
    8.33+;;; Code:
    8.34+(in-package :dat/parquet)
     9.1--- a/lisp/std/bit.lisp	Sat Jul 13 00:03:13 2024 -0400
     9.2+++ b/lisp/std/bit.lisp	Sat Jul 13 18:18:01 2024 -0400
     9.3@@ -498,22 +498,13 @@
     9.4                   (aref hexdigits (ldb (byte 4 0) byte))))
     9.5        finally (return string))))
     9.6 
     9.7-(defun octets-to-integer (octet-vec &optional (end (length octet-vec)))
     9.8+(defun octets-to-integer (octet-vec &optional (bytes (length octet-vec)))
     9.9   (declare (type (simple-array (unsigned-byte 8)) octet-vec))
    9.10   (do ((j 0 (1+ j))
    9.11        (sum 0))
    9.12-      ((>= j end) sum)
    9.13+      ((>= j bytes) sum)
    9.14     (setf sum (+ (aref octet-vec j) (ash sum 8)))))
    9.15 
    9.16-(defun read-little-endian (s &optional (bytes 4))
    9.17-  "Read a number in little-endian format from an byte (octet) stream S,
    9.18-the number having BYTES octets (defaulting to 4)."
    9.19-  (loop for i from 0 below bytes
    9.20-        sum (ash (read-byte s) (* 8 i))))
    9.21-
    9.22-(defun write-little-endian (i s &optional (bytes 4))
    9.23-  (write-sequence (nreverse (integer-to-octets i (* 8 bytes))) s))
    9.24-
    9.25 (defun integer-to-octets (bignum &optional (n-bits (integer-length bignum)))
    9.26   (let* ((n-bytes (ceiling n-bits 8))
    9.27          (octet-vec (make-array n-bytes :element-type '(unsigned-byte 8))))
    9.28@@ -522,3 +513,25 @@
    9.29           for index from 0
    9.30           do (setf (aref octet-vec index) (ldb (byte 8 (* i 8)) bignum))
    9.31           finally (return octet-vec))))
    9.32+
    9.33+(defun octets-to-integer-le (octet-vec &optional (bytes (length octet-vec)))
    9.34+  (declare (type (simple-array (unsigned-byte 8)) octet-vec))
    9.35+  (loop for i from 0 below bytes
    9.36+        sum (ash (aref octet-vec i) (* 8 i))))
    9.37+
    9.38+(defun integer-to-octets-le (bignum &optional (n-bits (integer-length bignum)))
    9.39+  (let* ((n-bytes (ceiling n-bits 8))
    9.40+         (octet-vec (make-array n-bytes :element-type '(unsigned-byte 8))))
    9.41+    (declare (type (simple-array (unsigned-byte 8)) octet-vec))
    9.42+    (loop for i from 0 below n-bytes
    9.43+          do (setf (aref octet-vec i) (ldb (byte 8 (* i 8)) bignum))
    9.44+          finally (return octet-vec))))
    9.45+
    9.46+(defun read-little-endian (s &optional (bytes 4))
    9.47+  "Read a number in little-endian format from a byte (octet) stream S,
    9.48+the number having BYTES octets (defaulting to 4)."
    9.49+  (loop for i from 0 below bytes
    9.50+        sum (ash (read-byte s) (* 8 i))))
    9.51+
    9.52+(defun write-little-endian (i s &optional (bytes 4)) ; writes I to octet stream S as BYTES little-endian octets
    9.53+  (write-sequence (integer-to-octets-le i (* 8 bytes)) s)) ; integer-to-octets-le takes a bit count, not a byte count
    10.1--- a/lisp/std/num/float.lisp	Sat Jul 13 00:03:13 2024 -0400
    10.2+++ b/lisp/std/num/float.lisp	Sat Jul 13 18:18:01 2024 -0400
    10.3@@ -1,6 +1,16 @@
    10.4 ;;; std/num/float.lisp --- Floating Point Numbers
    10.5 
    10.6-;;
    10.7+;; IEEE 754 Floating Point encoding and decoding.
    10.8+
    10.9+;;; Commentary:
   10.10+
   10.11+;; This package provides default encoders for float32 and float64 as defined
   10.12+;; by IEEE 754.
   10.13+
   10.14+;; Note that the physical encoding is always represented as a fixnum.
   10.15+
   10.16+;; To read/write from a file you must pass through a fixnum repr to bytes,
   10.17+;; usually using octets-to-integer or integer-to-octets (or their -le variants).
   10.18 
   10.19 ;;; Code:
   10.20 
   10.21@@ -11,6 +21,7 @@
   10.22 
   10.23 (in-package :std/num)
   10.24 (declaim (optimize (speed 3)))
   10.25+
   10.26 ;; The following macro may look a bit overcomplicated to the casual
   10.27 ;; reader. The main culprit is the fact that NaN and infinity can be
   10.28 ;; optionally included, which adds a bunch of conditional parts.
    11.1--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    11.2+++ b/lisp/std/num/leb128.lisp	Sat Jul 13 18:18:01 2024 -0400
    11.3@@ -0,0 +1,12 @@
    11.4+;;; leb128.lisp --- Little-Endian Base 128 Variable Encoding
    11.5+
    11.6+;; (U)LEB128 encoders
    11.7+
    11.8+;;; Commentary:
    11.9+
   11.10+;; ref: https://en.wikipedia.org/wiki/LEB128
   11.11+;; opt: https://arxiv.org/abs/1503.07387 VByte
   11.12+;; opt: https://arxiv.org/pdf/1709.08990 VByte streaming
   11.13+
   11.14+;;; Code:
   11.15+(in-package :std/num)
    12.1--- a/lisp/std/pkg.lisp	Sat Jul 13 00:03:13 2024 -0400
    12.2+++ b/lisp/std/pkg.lisp	Sat Jul 13 18:18:01 2024 -0400
    12.3@@ -351,6 +351,10 @@
    12.4    :octet-vector-to-hex-string
    12.5    :octets-to-integer
    12.6    :integer-to-octets
    12.7+   :octets-to-integer-le
    12.8+   :integer-to-octets-le
    12.9+   :read-little-endian
   12.10+   :write-little-endian
   12.11    :hexchar-to-int))
   12.12 
   12.13 (defpkg :std/fmt
    13.1--- a/lisp/std/std.asd	Sat Jul 13 00:03:13 2024 -0400
    13.2+++ b/lisp/std/std.asd	Sat Jul 13 18:18:01 2024 -0400
    13.3@@ -28,7 +28,8 @@
    13.4                (:module "num"
    13.5                 :components
    13.6                 ((:file "float")
    13.7-                 (:file "parse")))
    13.8+                 (:file "parse")
    13.9+                 (:file "leb128")))
   13.10                (:file "stream")
   13.11                (:module "fu"
   13.12                 :components
    14.1--- a/lisp/std/task.lisp	Sat Jul 13 00:03:13 2024 -0400
    14.2+++ b/lisp/std/task.lisp	Sat Jul 13 18:18:01 2024 -0400
    14.3@@ -69,7 +69,7 @@
    14.4 (defvar *default-worker-name* "worker")
    14.5 
    14.6 (defclass worker ()
    14.7-  ((thread :initform (%make-thread #.#1=(symbol-name (gensym "w")) t (make-semaphore :name #.#1#))
    14.8+  ((thread :initform (sb-thread::%make-thread #.#1=(symbol-name (gensym "w")) t (make-semaphore :name #.#1#))
    14.9            :accessor worker-thread
   14.10            :initarg :thread)
   14.11    (function :type function :accessor worker-function :initarg :function)