1.1--- a/lisp/lib/dat/parquet/gen.lisp Mon Jul 15 21:59:45 2024 -0400
1.2+++ b/lisp/lib/dat/parquet/gen.lisp Tue Jul 16 21:52:09 2024 -0400
1.3@@ -49,37 +49,55 @@
1.4 (substitute #\- #\_ string)))
1.5
1.6 (labels ((parse-type-id (type-id)
1.7- (string-case (type-id :default nil)
1.8- ("bool" 'boolean)
1.9- ("byte" 'signed-byte)
1.10- ("i16" '(signed-byte 16))
1.11- ("i32" '(signed-byte 32))
1.12- ("i64" '(signed-byte 64))
1.13- ("double" 'double-float)
1.14- ("string" 'string)
1.15- ("list" 'list)
1.16- ("binary" 'octet-vector)
1.17- ("set" 'list)))
1.18+ (when type-id
1.19+ (string-case (type-id :default nil)
1.20+ ("bool" 'boolean)
1.21+ ("byte" 'signed-byte)
1.22+ ("i16" '(signed-byte 16))
1.23+ ("i32" '(signed-byte 32))
1.24+ ("i64" '(signed-byte 64))
1.25+ ("double" 'double-float)
1.26+ ("string" 'string)
1.27+ ("list" 'vector)
1.28+ ("binary" 'octet-vector)
1.29+ ("set" 'vector)
1.30+ ("enum" '(signed-byte 32))
1.31+ ("union" 'union)
1.32+ ("struct" 'struct))))
1.33+ (%intern (name)
1.34+ (if (stringp name)
1.35+ (intern
1.36+ (cond
1.37+ ((equal name "UUIDType") "PARQUET-UUID-TYPE")
1.38+ (t (concatenate 'string
1.39+ "PARQUET-"
1.40+ (camelcase-name-to-lisp-name name))))
1.41+ :dat/parquet)
1.42+ name))
1.43 (parse-type (o)
1.44- (let ((name (string-case ((json-getf o "typeId"))
1.45- ("union" (json-getf o "class"))
1.46- ("struct" (json-getf o "class"))
1.47- ("enum" (json-getf o "class")))))
1.48- (intern
1.49- (cond
1.50- ((equal name "UUIDType") "PARQUET-UUID-TYPE")
1.51- (t (concatenate 'string
1.52- "PARQUET-"
1.53- (camelcase-name-to-lisp-name name))))
1.54- :dat/parquet))))
1.55+ (when o
1.56+ (string-case ((json-getf o "typeId"))
1.57+ ("union" (%intern (json-getf o "class")))
1.58+ ("list"
1.59+ (if-let ((elt (json-getf o "elemType" nil)))
1.60+ (%intern (parse-type elt))
1.61+ (parse-type-id (json-getf o "elemTypeId"))))
1.62+ ("set"
1.63+ (if-let ((elt (json-getf o "elemType" nil)))
1.64+ (%intern (parse-type elt))
1.65+ (parse-type-id (json-getf o "elemTypeId"))))
1.66+ ("struct" (%intern (json-getf o "class")))
1.67+ ("enum" (%intern (json-getf o "class")))))))
1.68 (defun convert-parquet-struct-field-type (field) ;; technically part of thrift type system
1.69- (let* ((type-id (parquet-struct-field-type-id field))
1.70- (type (parquet-struct-field-type field))
1.71- (required (parquet-struct-field-required field))
1.72- (unit-type (or (when type-id (parse-type-id type-id)) (when type (parse-type type)))))
1.73- (if (and (equal "optional" required) (not (equal unit-type 'list))) ;; (listp nil) = t
1.74- `(or null ,unit-type)
1.75- unit-type))))
1.76+ (let* ((type-id (parse-type-id (parquet-struct-field-type-id field)))
1.77+ (type (parse-type (parquet-struct-field-type field)))
1.78+ (required (parquet-struct-field-required field)))
1.79+ (let ((ret (cond
1.80+ ((eql 'vector type-id) `(vector ,type))
1.81+ (t (or type type-id)))))
1.82+ (if (equal "optional" required)
1.83+ `(or null ,ret)
1.84+ ret)))))
1.85
1.86 (defun parquet-json-enums ()
1.87 (list
1.88@@ -130,7 +148,6 @@
1.89 (setq *parquet-structs* (parquet-json-structs)))
1.90
1.91 ;;; CLOS
1.92-(defclass parquet-object () ())
1.93
1.94 ;; (defmethod print-object ((obj parquet-object) stream)
1.95 ;; "Output a Parquet object to a stream."
1.96@@ -138,7 +155,7 @@
1.97
1.98 (defmacro define-parquet-class (name superclasses slots &rest options)
1.99 "Define a new subclass of PARQUET-OBJECT with NAME."
1.100- `(defclass ,name ,@(if-let ((s superclasses)) (list s) `((parquet-object))) ,slots ,@options))
1.101+ `(defclass ,name ,@(if-let ((s superclasses)) (list s) `((dat/parquet::parquet-object))) ,slots ,@options))
1.102
1.103 ;;; Codegen
1.104
1.105@@ -148,16 +165,18 @@
1.106 "Define all known values in *PARQUET-STRUCTS* using DEFINE-PARQUET-CLASS (DEFCLASS)."
1.107 (loop for struct in *parquet-structs*
1.108 unless (null struct)
1.109- collect (let ((name (parquet-struct-name struct))
1.110- (doc (parquet-struct-doc struct))
1.111- (fields (parquet-struct-fields struct)))
1.112- `(define-parquet-class ,(intern (cond
1.113- ((equal name "UUIDType") "PARQUET-UUID-TYPE")
1.114- (t (concatenate 'string
1.115- "PARQUET-"
1.116- (camelcase-name-to-lisp-name name))))
1.117- :dat/parquet)
1.118- (parquet-struct-object)
1.119+ collect (let* ((name (parquet-struct-name struct))
1.120+ (doc (parquet-struct-doc struct))
1.121+ (fields (parquet-struct-fields struct))
1.122+ (class-name (intern (cond
1.123+ ((equal name "UUIDType") "PARQUET-UUID-TYPE")
1.124+ (t (concatenate 'string
1.125+ "PARQUET-"
1.126+ (camelcase-name-to-lisp-name name))))
1.127+ :dat/parquet)))
1.128+ `(progn
1.129+ (define-parquet-class ,class-name
1.130+ (dat/parquet::parquet-struct-object)
1.131 (,@(mapcar (lambda (f)
1.132 (let ((fdoc (parquet-struct-field-doc f))
1.133 (fname (snakecase-name-to-lisp-name
1.134@@ -171,7 +190,8 @@
1.135 ,@(when-let ((ty (convert-parquet-struct-field-type f)))
1.136 `(:type ,ty)))))
1.137 fields))
1.138- ,@(when doc `((:documentation ,doc))))))))
1.139+ ,@(when doc `((:documentation ,doc))))
1.140+ ',class-name)))))
1.141
1.142 (defmacro define-parquet-structs ()
1.143 `(list
1.144@@ -196,8 +216,7 @@
1.145 (defun load-parquet (&key (file *parquet-json-file*))
1.146 (init-parquet-json file)
1.147 (with-package (:dat/parquet)
1.148- (define-parquet-class parquet-struct-object () ())
1.149 (let ((types (define-parquet-types)))
1.150 (export types))
1.151- (export (mapcar 'class-name (define-parquet-structs)))
1.152+ (export (define-parquet-structs))
1.153 (export *parquet-enums*)))
2.1--- a/lisp/lib/dat/parquet/obj.lisp Mon Jul 15 21:59:45 2024 -0400
2.2+++ b/lisp/lib/dat/parquet/obj.lisp Tue Jul 16 21:52:09 2024 -0400
2.3@@ -5,6 +5,10 @@
2.4 ;;; Code:
2.5 (in-package :dat/parquet)
2.6
2.7+(defclass parquet-object () ())
2.8+
2.9+(defclass parquet-struct-object (parquet-object) ())
2.10+
2.11 (eval-always
2.12 (dat/parquet/gen::load-parquet))
2.13
2.14@@ -22,3 +26,4 @@
2.15 (def-parquet-type parquet-type)
2.16 (def-parquet-type parquet-converted-type)
2.17 (def-parquet-type parquet-page-type))
2.18+
3.1--- a/lisp/lib/dat/parquet/tcompact.lisp Mon Jul 15 21:59:45 2024 -0400
3.2+++ b/lisp/lib/dat/parquet/tcompact.lisp Tue Jul 16 21:52:09 2024 -0400
3.3@@ -15,8 +15,12 @@
3.4
3.5 ;;; Protocol
3.6
3.7-(defgeneric tcompact-field-id (self))
3.8-(defgeneric tcompact-element-id (self))
3.9+(defclass thrift-object (id) ())
3.10+
3.11+(defgeneric thrift-element-type (self)
3.12+ (:method ((self parquet-struct-object)) :struct))
3.13+
3.14+(defgeneric thrift-object-length (self))
3.15
3.16 ;;; Integers
3.17
3.18@@ -35,7 +39,7 @@
3.19 (logxor (ash n 1) (ash n -63)))
3.20
3.21 (defun zagzig (n)
3.22- (declare (fixnum n))
3.23+ (declare (integer n))
3.24 (logxor (ash n -1) (- (logand n 1))))
3.25
3.26 (defun tcompact-encode-integer (n &optional (size 8))
3.27@@ -44,8 +48,6 @@
3.28 (vector n)
3.29 (encode-uleb128 (zigzag n) size)))
3.30
3.31-;; (zagzig (zigzag -3)) ;=-3
3.32-
3.33 ;;; Enums
3.34
3.35 ;; ordinal value encoded as int32
3.36@@ -73,6 +75,15 @@
3.37 (defun tcompact-encode-string (string)
3.38 (sb-ext:string-to-octets string :external-format :utf-8))
3.39
3.40+;;; Double
3.41+(defun tcompact-encode-double (float)
3.42+ (tcompact-encode-integer (encode-float32 float)))
3.43+
3.44+;;; Boolean
3.45+
3.46+(defun tcompact-encode-boolean (bool)
3.47+ (if bool 1 0))
3.48+
3.49 ;;; UUID
3.50
3.51 ;; always 16 bytes, no length header
3.52@@ -120,7 +131,8 @@
3.53
3.54 (defvar *tcompact-field-types*
3.55 #(:true :false :i8 :i16 :i32 :i64 :double :binary :list :set :map :struct :uuid))
3.56-(defun tcompact-field-type-id (n) (1+ (aref *tcompact-field-types* n)))
3.57+(defun tcompact-field-type-id* (n) (1+ (aref *tcompact-field-types* n)))
3.58+(defun tcompact-field-type-id (k) (1+ (position k *tcompact-field-types*)))
3.59
3.60 ;; (ldb (byte 4 0) n)
3.61 (defun tcompact-encode-field-header-short (id-delta type-id)
3.62@@ -131,13 +143,17 @@
3.63 (tcompact-encode-integer id))
3.64
3.65 (defun tcompact-encode-field-header (field)
3.66- (concatenate 'octet-vector
3.67- (tcompact-encode-field-header-short 0 (tcompact-field-type-id field))
3.68- (tcompact-encode-field-id (tcompact-field-id field))))
3.69+ (let ((ret (make-array 5 :element-type '(unsigned-byte 8) :fill-pointer 0)))
3.70+ (vector-push (tcompact-encode-field-header-short 0 (tcompact-field-type-id* field))
3.71+ ret)
3.72+ (loop for x across (tcompact-encode-field-id (id field))
3.73+ do (vector-push x ret)
3.74+ finally (return ret))))
3.75
3.76 (defun tcompact-encode-field-value (field))
3.77
3.78 (defun tcompact-encode-struct (struct))
3.79+
3.80 ;; field-id-delta = current-field-id - previous-field-id
3.81
3.82 ;;; List and Set
3.83@@ -155,13 +171,26 @@
3.84 |#
3.85
3.86 (deftype tcompact-element-type-id () '(unsigned-byte 4))
3.87+;; tcompact short size = [0,14]
3.88
3.89 (defvar *tcompact-element-types*
3.90 #(:bool :i8 :i16 :i32 :i64 :double :binary :list :set :map :struct :uuid))
3.91
3.92-(defun tcompact-element-type-id (n) (+ (aref *tcompact-element-types* n) 2))
3.93+(defun tcompact-element-type-id* (n) (+ (aref *tcompact-element-types* n) 2))
3.94+(defun tcompact-element-type-id (k) (+ (position k *tcompact-element-types*) 2))
3.95+
3.96+(defun tcompact-encode-list-header-short (size elt-type)
3.97+ (dpb elt-type (byte 4 4)
3.98+ (dpb size (byte 4 0) 0)))
3.99
3.100-(defun tcompact-encode-list ())
3.101+(defun tcompact-encode-list-header (list)
3.102+ (let ((ret (make-array 5 :element-type '(unsigned-byte 8) :fill-pointer 0)))
3.103+ (vector-push (tcompact-encode-list-header-short #xf (id list)) ret)
3.104+ (loop for x across (tcompact-encode-integer (thrift-object-length list) 4)
3.105+ do (vector-push x ret)
3.106+ finally (return ret))))
3.107+
3.108+(defun tcompact-encode-list-element (type value))
3.109
3.110 ;;; Map
3.111
4.1--- a/lisp/lib/dat/tests.lisp Mon Jul 15 21:59:45 2024 -0400
4.2+++ b/lisp/lib/dat/tests.lisp Tue Jul 16 21:52:09 2024 -0400
4.3@@ -1,5 +1,5 @@
4.4 (defpackage :dat/tests
4.5- (:use :cl :std :rt :dat))
4.6+ (:use :cl :std :rt :dat :log))
4.7
4.8 (in-package :dat/tests)
4.9
4.10@@ -148,7 +148,7 @@
4.11
4.12 (defparameter *parquet-test-file*
4.13 (probe-file
4.14- (merge-pathnames "../../../.stash/datapage_v1-uncompressed-checksum.parquet"
4.15+ (merge-pathnames "../../../.stash/alltypes_plain.parquet"
4.16 #.(asdf:system-source-directory :dat/tests))))
4.17 ;; see also: https://github.com/apache/parquet-testing/blob/master/data/README.md
4.18 (deftest parquet-basic ()
4.19@@ -157,6 +157,6 @@
4.20 (let ((footer (dat/parquet::parquet-read-footer st)))
4.21 (is (typep footer
4.22 'dat/parquet::parquet-file-meta-data))
4.23- (print (slot-value footer 'dat/parquet::schema))
4.24- (print (file-position st))
4.25- (print (file-length st))))))
4.26+ (trace! (slot-value footer 'dat/parquet::schema))
4.27+ (trace! (file-position st))
4.28+ (trace! (file-length st))))))
5.1--- a/lisp/std/num/leb128.lisp Mon Jul 15 21:59:45 2024 -0400
5.2+++ b/lisp/std/num/leb128.lisp Tue Jul 16 21:52:09 2024 -0400
5.3@@ -67,10 +67,10 @@
5.4 (return-from decode-leb128 (values (logior result (the fixnum (ash (lognot 0) shift))) counter))
5.5 (return-from decode-leb128 (values result counter)))))))
5.6
5.7-(declaim (ftype (function (fixnum &optional (unsigned-byte 8)) (simple-array (unsigned-byte 8))) encode-uleb128))
5.8+(declaim (ftype (function (integer &optional (unsigned-byte 8)) (array (unsigned-byte 8))) encode-uleb128))
5.9 (defun encode-uleb128 (int &optional size)
5.10 "Encode an integer INT as a ULEB128 byte array with SIZE (in bytes)."
5.11- (declare (fixnum int))
5.12+ (declare (integer int))
5.13 (let ((more t) (curr) (in 0) (ret (make-array
5.14 (if size
5.15 size
5.16@@ -88,15 +88,15 @@
5.17 (incf in))
5.18 ret))
5.19
5.20-(declaim (ftype (function ((vector unsigned-byte) &optional fixnum) fixnum) decode-uleb128))
5.21+(declaim (ftype (function ((vector unsigned-byte) &optional t) integer) decode-uleb128))
5.22 (defun decode-uleb128 (bits &optional (start 0))
5.23 "Decode an unsigned integer from ULEB128 byte array."
5.24 (let ((result 0) (shift 0) (curr) (counter 0))
5.25- (declare (fixnum result shift counter start))
5.26+ (declare (fixnum shift counter))
5.27 (loop do
5.28 (setf curr (aref bits start))
5.29 (setf start (+ 1 start))
5.30- (setf result (logior result (the fixnum (ash (logand curr #x7f) shift))))
5.31+ (setf result (logior result (ash (logand curr #x7f) shift)))
5.32 (setf shift (+ 7 shift))
5.33 (incf counter)
5.34 (when (= 0 (logand curr #x80))
5.35@@ -110,10 +110,10 @@
5.36 (when (not (= start 0))
5.37 (loop for i from 0 upto start do (read-byte s)))
5.38 (let ((result 0) (shift 0) (curr) (counter 0))
5.39- (declare (fixnum result shift counter))
5.40+ (declare (fixnum shift counter))
5.41 (loop do
5.42 (setf curr (read-byte s))
5.43- (setf result (logior result (the fixnum (ash (logand curr #x7f) shift))))
5.44+ (setf result (logior result (ash (logand curr #x7f) shift)))
5.45 (setf shift (+ 7 shift))
5.46 (incf counter)
5.47 (when (= 0 (logand curr #x80))
6.1--- a/skelfile Mon Jul 15 21:59:45 2024 -0400
6.2+++ b/skelfile Tue Jul 16 21:52:09 2024 -0400
6.3@@ -39,6 +39,8 @@
6.4 (parquet.json ()
6.5 (net/fetch:download "https://packy.compiler.company/data/parquet.json"
6.6 :output ".stash/parquet.json"))
6.7+ (parquet-test-data () (net/fetch:download "https://packy.compiler.company/data/test/alltypes_plain.parquet"
6.8+ :output ".stash/alltypes_plain.parquet"))
6.9 ;; lisp
6.10 (%stash () #$mkdir -pv .stash$#)
6.11 (rdb (%stash) (with-sbcl (:noinform t :quit t)