1.1--- a/lisp/lib/dat/dat.asd Mon Jul 15 15:18:03 2024 -0400
1.2+++ b/lisp/lib/dat/dat.asd Mon Jul 15 20:59:04 2024 -0400
1.3@@ -14,6 +14,7 @@
1.4 ((:file "gen")
1.5 (:file "pkg")
1.6 (:file "obj")
1.7+ (:file "tcompact")
1.8 (:file "io")
1.9 (:file "rle")
1.10 (:file "proto")))
2.1--- a/lisp/lib/dat/parquet/gen.lisp Mon Jul 15 15:18:03 2024 -0400
2.2+++ b/lisp/lib/dat/parquet/gen.lisp Mon Jul 15 20:59:04 2024 -0400
2.3@@ -100,21 +100,6 @@
2.4 (:constructor make-parquet-struct-field (key name type-id type doc required)))
2.5 key name type-id type doc required)
2.6
2.7-(defun parquet-destruct-field (field)
2.8- (list (parquet-struct-field-name field)
2.9- (parquet-struct-field-key field)
2.10- (parquet-struct-field-doc field)
2.11- (parquet-struct-field-type-id field)
2.12- (parquet-struct-field-type field)
2.13- (parquet-struct-field-required field)))
2.14-
2.15-(defun parquet-destruct (struct)
2.16- (list (parquet-struct-name struct)
2.17- (parquet-struct-doc struct)
2.18- (parquet-struct-unionp struct)
2.19- (parquet-struct-exceptionp struct)
2.20- (mapcar #'parquet-destruct-field (parquet-struct-fields struct))))
2.21-
2.22 (defun parquet-json-structs () ;; name doc isException isUnion fields
2.23 (mapcar
2.24 (lambda (s)
2.25@@ -212,12 +197,6 @@
2.26 (with-package (:dat/parquet)
2.27 (define-parquet-class parquet-struct-object () ())
2.28 (let ((types (define-parquet-types)))
2.29- (export types)
2.30- (deftype dat/parquet::parquet-type (&optional (designator octet-vector) optional)
2.31- (if optional
2.32- (if (eql designator 'list)
2.33- list
2.34- `(or null ,designator))
2.35- designator)))
2.36+ (export types))
2.37 (export (mapcar 'class-name (define-parquet-structs)))
2.38 (export *parquet-enums*)))
3.1--- a/lisp/lib/dat/parquet/io.lisp Mon Jul 15 15:18:03 2024 -0400
3.2+++ b/lisp/lib/dat/parquet/io.lisp Mon Jul 15 20:59:04 2024 -0400
3.3@@ -1,6 +1,6 @@
3.4 ;;; io.lisp --- Parquet IO
3.5
3.6-;;
3.7+;;
3.8
3.9 ;;; Code:
3.10 (in-package :dat/parquet)
3.11@@ -10,9 +10,47 @@
3.12 (write-string +parquet-magic-number+ stream))
3.13
3.14 (defun parquet-read-magic (stream)
3.15- (assert (char= #.(char +parquet-magic-number+ 0) (read-char stream)))
3.16- (assert (char= #.(char +parquet-magic-number+ 1) (read-char stream)))
3.17- (assert (char= #.(char +parquet-magic-number+ 2) (read-char stream)))
3.18- (assert (char= #.(char +parquet-magic-number+ 3) (read-char stream)))
3.19- t)
3.20+ (assert (= #.(char-code (aref +parquet-magic-number+ 0)) (read-byte stream)))
3.21+ (assert (= #.(char-code (aref +parquet-magic-number+ 1)) (read-byte stream)))
3.22+ (assert (= #.(char-code (aref +parquet-magic-number+ 2)) (read-byte stream)))
3.23+ (assert (= #.(char-code (aref +parquet-magic-number+ 3)) (read-byte stream))))
3.24+
3.25+(defun parquet-read-unsigned (stream)
3.26+ (read-uleb128 stream))
3.27+
3.28+(defun parquet-read-signed (stream)
3.29+ (read-leb128 stream))
3.30+
3.31+(defun parquet-read-boolean (stream)
3.32+ (ecase (read-byte stream)
3.33+ (0 nil)
3.34+ (1 t)))
3.35
3.36+(defun parquet-file-stream-p (stream)
3.37+ "Assert the start and end of a file STREAM are the parquet magic bytes."
3.38+ (parquet-read-magic stream)
3.39+ ;; set position to end - 4
3.40+ (file-position stream (- (the fixnum (file-length stream)) 4))
3.41+ (parquet-read-magic stream))
3.42+
3.43+(defun parquet-read-schema-element (stream))
3.44+(defun parquet-read-schema (stream)
3.45+ "Read a parquet-schema which is repeated list of parquet-schema-element."
3.46+
3.47+ )
3.48+
3.49+(defun parquet-read-file-meta-data (stream)
3.50+ "Read a parquet-file-meta-data object from STREAM."
3.51+ ;; version
3.52+ (make-instance 'parquet-file-meta-data
3.53+ :version (parquet-read-signed stream)
3.54+ :schema (list (parquet-read-signed stream))))
3.55+
3.56+(defun parquet-read-footer (stream)
3.57+ "Read the footer of parquet data in STREAM."
3.58+ (parquet-file-stream-p stream)
3.59+ ;; set file-position, read metadata length and magic
3.60+ (file-position stream (- (file-length stream) 8))
3.61+ (let ((len (parquet-read-unsigned stream)))
3.62+ (file-position stream (- (file-length stream) 8 len))
3.63+ (parquet-read-file-meta-data stream)))
4.1--- a/lisp/lib/dat/parquet/obj.lisp Mon Jul 15 15:18:03 2024 -0400
4.2+++ b/lisp/lib/dat/parquet/obj.lisp Mon Jul 15 20:59:04 2024 -0400
4.3@@ -8,16 +8,17 @@
4.4 (eval-always
4.5 (dat/parquet/gen::load-parquet))
4.6
4.7-(deftype parquet-compression-codec () `(member ,*parquet-compression-codecs*))
4.8-
4.9-(deftype parquet-boundary-order () `(member ,*parquet-boundary-orders*))
4.10-
4.11-(deftype parquet-encoding () `(member ,*parquet-encodings*))
4.12-
4.13-(deftype parquet-field-repetition () `(member ,*parquet-field-repetition-types*))
4.14-
4.15-(deftype parquet-type-designator () `(member ,*parquet-types*))
4.16-
4.17-(deftype parquet-converted-type-designator () `(member ,*parquet-converted-types*))
4.18-
4.19-(deftype parquet-page-type () `(member ,*parquet-page-types*))
4.20+(macrolet ((def-parquet-type (name)
4.21+ (let ((var-name (symbolicate "*" name "S*"))
4.22+ (name1 (symbolicate name "*")))
4.23+ `(progn
4.24+ (deftype ,name () `(member ,,var-name))
4.25+ (defun ,name (d) (position d ,var-name :test 'eql))
4.26+ (defun ,name1 (n) (elt ,var-name n))))))
4.27+ (def-parquet-type parquet-compression-codec)
4.28+ (def-parquet-type parquet-boundary-order)
4.29+ (def-parquet-type parquet-encoding)
4.30+ (def-parquet-type parquet-field-repetition-type)
4.31+ (def-parquet-type parquet-type)
4.32+ (def-parquet-type parquet-converted-type)
4.33+ (def-parquet-type parquet-page-type))
5.1--- /dev/null Thu Jan 01 00:00:00 1970 +0000
5.2+++ b/lisp/lib/dat/parquet/tcompact.lisp Mon Jul 15 20:59:04 2024 -0400
5.3@@ -0,0 +1,184 @@
5.4+;;; tcompact.lisp --- Thrift Compact Protocol
5.5+
5.6+;; ref: https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md
5.7+
5.8+;;; Commentary:
5.9+
5.10+;; in order to encode Parquet, we need to be able to encode the Thrift Compact
5.11+;; Protocol (TCompact). All thrift structures we've generated via parquet.json
5.12+;; are serialized using TCompact.
5.13+
5.14+;; see also: https://thrift.apache.org/static/files/thrift-20070401.pdf
5.15+
5.16+;;; Code:
5.17+(in-package :dat/parquet)
5.18+
5.19+;;; Protocol
5.20+
5.21+(defgeneric tcompact-field-id (self))
5.22+(defgeneric tcompact-element-id (self))
5.23+
5.24+;;; Integers
5.25+
5.26+#|
5.27+50399 = 11000100 11011111 (LSB)
5.28+ = 0000011 0001001 1011111 (7-bit groups)
5.29+ = 00000011 10001001 11011111 (add continuation bits)
5.30+ = 0x03 0x89 0xDF (hex)
5.31+→ 0xDF 0x89 0x03 (write to ram LSB first)
5.32+|#
5.33+
5.34+;; encoded as ULEB128. signed and unsigned bytes are encoded as single
5.35+;; bytes. all others are coverted to int64.
5.36+(defun zigzag (n)
5.37+ (declare (integer n))
5.38+ (logxor (ash n 1) (ash n -63)))
5.39+
5.40+(defun zagzig (n)
5.41+ (declare (fixnum n))
5.42+ (logxor (ash n -1) (- (logand n 1))))
5.43+
5.44+(defun tcompact-encode-integer (n &optional (size 8))
5.45+ (declare (integer n))
5.46+ (if (<= (integer-length n) 8)
5.47+ (vector n)
5.48+ (encode-uleb128 (zigzag n) size)))
5.49+
5.50+;; (zagzig (zigzag -3)) ;=-3
5.51+
5.52+;;; Enums
5.53+
5.54+;; ordinal value encoded as int32
5.55+(defun tcompact-encode-enum (n)
5.56+ (tcompact-encode-integer n 4))
5.57+
5.58+;;; Binary
5.59+
5.60+#|
5.61+Binary protocol, binary data, 1+ bytes:
5.62++--------+...+--------+--------+...+--------+
5.63+| byte length | bytes |
5.64++--------+...+--------+--------+...+--------+
5.65+|#
5.66+
5.67+;; a varint followed by the bytes
5.68+(defun tcompact-encode-octet-vector (octets)
5.69+ (concatenate 'octet-vector
5.70+ (tcompact-encode-integer (length octets))
5.71+ octets))
5.72+
5.73+;;; String
5.74+
5.75+;; encoded as UTF-8 bytes without null-termination
5.76+(defun tcompact-encode-string (string)
5.77+ (sb-ext:string-to-octets string :external-format :utf-8))
5.78+
5.79+;;; UUID
5.80+
5.81+;; always 16 bytes, no length header
5.82+(defun tcompact-encode-uuid (uuid)
5.83+ (declare (obj/uuid:uuid uuid))
5.84+ (obj/uuid:uuid-to-octet-vector uuid))
5.85+
5.86+;;; Structs
5.87+
5.88+;; struct ::= ( field-header field-value )* stop-field
5.89+;; field-header ::= field-type field-id
5.90+
5.91+#|
5.92+Compact protocol field header (short form) and field value:
5.93++--------+--------+...+--------+
5.94+|ddddtttt| field value |
5.95++--------+--------+...+--------+
5.96+
5.97+Compact protocol field header (1 to 3 bytes, long form) and field value:
5.98++--------+--------+...+--------+--------+...+--------+
5.99+|0000tttt| field id | field value |
5.100++--------+--------+...+--------+--------+...+--------+
5.101+
5.102+Compact protocol stop field:
5.103++--------+
5.104+|00000000|
5.105++--------+
5.106+|#
5.107+
5.108+;; sequences of zero or more 'fields' followed by a stop field.
5.109+
5.110+;; each field starts with a field header and is followed by the encoded field
5.111+;; value.
5.112+
5.113+;; the field-id is represented in Lisp via OBJ/ID.
5.114+
5.115+;; note that it is possible to handle unknown fields while decoding. in the
5.116+;; usual case these are ignored.
5.117+
5.118+(declaim ((unsigned-byte 8) +tcompact-stop-field+))
5.119+(defconstant +tcompact-stop-field+ 0)
5.120+(deftype tcompact-field-id () '(integer 0 32767))
5.121+(deftype tcompact-field-id-delta () '(unsigned-byte 4))
5.122+(deftype tcompact-field-type-id () '(unsigned-byte 4))
5.123+
5.124+(defvar *tcompact-field-types*
5.125+ #(:true :false :i8 :i16 :i32 :i64 :double :binary :list :set :map :struct :uuid))
5.126+(defun tcompact-field-type-id (n) (1+ (aref *tcompact-field-types* n)))
5.127+
5.128+;; (ldb (byte 4 0) n)
5.129+(defun tcompact-encode-field-header-short (id-delta type-id)
5.130+ (dpb type-id (byte 4 4)
5.131+ (dpb id-delta (byte 4 0) 0)))
5.132+
5.133+(defun tcompact-encode-field-id (id)
5.134+ (tcompact-encode-integer id))
5.135+
5.136+(defun tcompact-encode-field-header (field)
5.137+ (concatenate 'octet-vector
5.138+ (tcompact-encode-field-header-short 0 (tcompact-field-type-id field))
5.139+ (tocmpact-encode-field-id (tcompact-field-id field))))
5.140+
5.141+(defun tcompact-encode-field-value (field))
5.142+
5.143+(defun tcompact-encode-struct (struct))
5.144+ ;; field-id-delta = current-field-id - previous-field-id
5.145+
5.146+;;; List and Set
5.147+
5.148+#|
5.149+Compact protocol list header (1 byte, short form) and elements:
5.150++--------+--------+...+--------+
5.151+|sssstttt| elements |
5.152++--------+--------+...+--------+
5.153+
5.154+Compact protocol list header (2+ bytes, long form) and elements:
5.155++--------+--------+...+--------+--------+...+--------+
5.156+|1111tttt| size | elements |
5.157++--------+--------+...+--------+--------+...+--------+
5.158+|#
5.159+
5.160+(deftype tcompact-element-type-id () '(unsigned-byte 4))
5.161+
5.162+(defvar *tcompact-element-types*
5.163+ #(:bool :i8 :i16 :i32 :i64 :double :binary :list :set :map :struct :uuid))
5.164+
5.165+(defun tcompact-element-type-id (n) (+ (aref *tcompact-element-types* n) 2))
5.166+
5.167+(defun tcompact-encode-list ())
5.168+
5.169+;;; Map
5.170+
5.171+;; map ::= empty-map | non-empty-map
5.172+;; empty-map ::= `0`
5.173+;; non-empty-map ::= size key-element-type value-element-type (key value)+
5.174+
5.175+#|
5.176+Compact protocol map header (1 byte, empty map):
5.177++--------+
5.178+|00000000|
5.179++--------+
5.180+
5.181+Compact protocol map header (2+ bytes, non empty map) and key value pairs:
5.182++--------+...+--------+--------+--------+...+--------+
5.183+| size |kkkkvvvv| key value pairs |
5.184++--------+...+--------+--------+--------+...+--------+
5.185+|#
5.186+
5.187+(defun tcompact-encode-map ())
6.1--- a/lisp/lib/dat/pkg.lisp Mon Jul 15 15:18:03 2024 -0400
6.2+++ b/lisp/lib/dat/pkg.lisp Mon Jul 15 20:59:04 2024 -0400
6.3@@ -234,7 +234,7 @@
6.4 (:export))
6.5
6.6 (defpackage :dat/parquet
6.7- (:use :cl :std :dat/proto :dat/json)
6.8+ (:use :cl :std :obj/id :dat/proto :dat/json)
6.9 (:export
6.10 :parquet-object
6.11 :parquet-enum-object
7.1--- a/lisp/lib/dat/tests.lisp Mon Jul 15 15:18:03 2024 -0400
7.2+++ b/lisp/lib/dat/tests.lisp Mon Jul 15 20:59:04 2024 -0400
7.3@@ -146,10 +146,12 @@
7.4 (with-output-to-string (s)
7.5 (is (write-sxp-stream f s)))))
7.6
7.7+;; see also: https://github.com/apache/parquet-testing/blob/master/data/README.md
7.8 (deftest parquet-basic ()
7.9- (is
7.10- (with-input-from-string
7.11- (s
7.12- (with-output-to-string (s)
7.13- (dat/parquet::parquet-write-magic s)))
7.14- (dat/parquet::parquet-read-magic s))))
7.15+ (with-open-file (st "/home/ellis/comp/core/.stash/datapage_v1-uncompressed-checksum.parquet" :element-type '(unsigned-byte 8))
7.16+ (let ((footer (dat/parquet::parquet-read-footer st)))
7.17+ (is (typep footer
7.18+ 'dat/parquet::parquet-file-meta-data))
7.19+ (print (slot-value footer 'dat/parquet::schema))
7.20+ (print (file-position st))
7.21+ (print (file-length st)))))
8.1--- a/lisp/std/num/float.lisp Mon Jul 15 15:18:03 2024 -0400
8.2+++ b/lisp/std/num/float.lisp Mon Jul 15 20:59:04 2024 -0400
8.3@@ -10,7 +10,7 @@
8.4 ;; Note that the physical encoding is always represented as a fixnum.
8.5
8.6 ;; To read/write from a file you must pass through a fixnum repr to bytes,
8.7-;; usually using octets-to-integer or integer-to-octets. There are also
8.8+;; usually using octets-to-integer or integer-to-octets.
8.9
8.10 ;;; Code:
8.11
9.1--- a/lisp/std/num/leb128.lisp Mon Jul 15 15:18:03 2024 -0400
9.2+++ b/lisp/std/num/leb128.lisp Mon Jul 15 20:59:04 2024 -0400
9.3@@ -17,7 +17,6 @@
9.4 "Encode an integer of arbitrary length into a leb128 unsigned-8 buffer"
9.5 (let ((more t) (curr) (in 0) (int (make-array
9.6 4
9.7- :adjustable t
9.8 :fill-pointer 0
9.9 :element-type '(unsigned-byte 8)))) ;(neg (< i 0))
9.10 (declare (fixnum i in))
9.11@@ -68,14 +67,16 @@
9.12 (return-from decode-leb128 (values (logior result (the fixnum (ash (lognot 0) shift))) counter))
9.13 (return-from decode-leb128 (values result counter)))))))
9.14
9.15-(declaim (ftype (function (fixnum) (simple-array (unsigned-byte 8))) encode-uleb128))
9.16-(defun encode-uleb128 (int)
9.17- "Encode an integer INT as a ULEB128 byte array."
9.18+(declaim (ftype (function (fixnum &optional (unsigned-byte 8)) (simple-array (unsigned-byte 8))) encode-uleb128))
9.19+(defun encode-uleb128 (int &optional size)
9.20+ "Encode an integer INT as a ULEB128 byte array with SIZE (in bytes)."
9.21 (declare (fixnum int))
9.22 (let ((more t) (curr) (in 0) (ret (make-array
9.23- (if (zerop int)
9.24- 1
9.25- (ceiling (/ (log (+ int 1) 2) 7)))
9.26+ (if size
9.27+ size
9.28+ (if (zerop int)
9.29+ 1
9.30+ (ceiling (/ (log (+ int 1) 2) 7))))
9.31 :element-type '(unsigned-byte 8)))) ;(neg (< int 0))
9.32 (loop while more do
9.33 (setf curr (logand int #x7f))
9.34@@ -85,14 +86,13 @@
9.35 (setf curr (logior curr #x80)))
9.36 (setf (aref ret in) curr)
9.37 (incf in))
9.38- (coerce ret 'simple-array)))
9.39+ ret))
9.40
9.41-(declaim (ftype (function ((array (unsigned-byte 8)) &optional t) fixnum) decode-uleb128))
9.42+(declaim (ftype (function ((vector unsigned-byte) &optional fixnum) fixnum) decode-uleb128))
9.43 (defun decode-uleb128 (bits &optional (start 0))
9.44 "Decode an unsigned integer from ULEB128 byte array."
9.45 (let ((result 0) (shift 0) (curr) (counter 0))
9.46- (declare (fixnum result shift counter start)
9.47- ((array (unsigned-byte 8)) bits))
9.48+ (declare (fixnum result shift counter start))
9.49 (loop do
9.50 (setf curr (aref bits start))
9.51 (setf start (+ 1 start))