changelog shortlog graph tags branches files raw help

Mercurial > core / changeset: more work on tcompact/thrift, fixed type info in parquet-struct-objects

changeset 550: 4d34907c69eb
parent 549: 32bd859533b3
child 551: 83b71948b92c
author: Richard Westhaver <ellis@rwest.io>
date: Tue, 16 Jul 2024 21:52:09 -0400
files: lisp/lib/dat/parquet/gen.lisp lisp/lib/dat/parquet/obj.lisp lisp/lib/dat/parquet/tcompact.lisp lisp/lib/dat/tests.lisp lisp/std/num/leb128.lisp skelfile
description: more work on tcompact/thrift, fixed type info in parquet-struct-objects
     1.1--- a/lisp/lib/dat/parquet/gen.lisp	Mon Jul 15 21:59:45 2024 -0400
     1.2+++ b/lisp/lib/dat/parquet/gen.lisp	Tue Jul 16 21:52:09 2024 -0400
     1.3@@ -49,37 +49,55 @@
     1.4    (substitute #\- #\_ string)))
     1.5 
     1.6 (labels ((parse-type-id (type-id)
     1.7-           (string-case (type-id :default nil)
     1.8-             ("bool" 'boolean)
     1.9-             ("byte" 'signed-byte)
    1.10-             ("i16" '(signed-byte 16))
    1.11-             ("i32" '(signed-byte 32))
    1.12-             ("i64" '(signed-byte 64))
    1.13-             ("double" 'double-float)
    1.14-             ("string" 'string)
    1.15-             ("list" 'list)
    1.16-             ("binary" 'octet-vector)
    1.17-             ("set" 'list)))
    1.18+           (when type-id
    1.19+             (string-case (type-id :default nil)
    1.20+               ("bool" 'boolean)
    1.21+               ("byte" 'signed-byte)
    1.22+               ("i16" '(signed-byte 16))
    1.23+               ("i32" '(signed-byte 32))
    1.24+               ("i64" '(signed-byte 64))
    1.25+               ("double" 'double-float)
    1.26+               ("string" 'string)
    1.27+               ("list" 'vector)
    1.28+               ("binary" 'octet-vector)
    1.29+               ("set" 'vector)
    1.30+               ("enum" '(signed-byte 32))
    1.31+               ("union" 'union)
    1.32+               ("struct" 'struct))))
    1.33+         (%intern (name)
    1.34+           (if (stringp name)
    1.35+               (intern
    1.36+                (cond 
    1.37+                  ((equal name "UUIDType") "PARQUET-UUID-TYPE")
    1.38+                  (t (concatenate 'string
    1.39+                                  "PARQUET-"
    1.40+                                  (camelcase-name-to-lisp-name name))))
    1.41+                :dat/parquet)
    1.42+               name))
    1.43          (parse-type (o)
    1.44-           (let ((name (string-case ((json-getf o "typeId"))
    1.45-                         ("union" (json-getf o "class"))
    1.46-                         ("struct" (json-getf o "class"))
    1.47-                         ("enum" (json-getf o "class")))))
    1.48-             (intern
    1.49-              (cond 
    1.50-                ((equal name "UUIDType") "PARQUET-UUID-TYPE")
    1.51-                (t (concatenate 'string
    1.52-                         "PARQUET-"
    1.53-                         (camelcase-name-to-lisp-name name))))
    1.54-              :dat/parquet))))
    1.55+           (when o
    1.56+             (string-case ((json-getf o "typeId"))
    1.57+               ("union" (%intern (json-getf o "class")))
    1.58+               ("list"
    1.59+                (if-let ((elt (json-getf o "elemType" nil)))
    1.60+                  (%intern (parse-type elt))
    1.61+                  (parse-type-id (json-getf o "elemTypeId"))))
    1.62+               ("set"
    1.63+                (if-let ((elt (json-getf o "elemType" nil)))
    1.64+                  (%intern (parse-type elt))
    1.65+                  (parse-type-id (json-getf o "elemTypeId"))))
    1.66+               ("struct" (%intern (json-getf o "class")))
    1.67+               ("enum" (%intern (json-getf o "class")))))))
    1.68   (defun convert-parquet-struct-field-type (field) ;; technically part of thrift type system
    1.69-    (let* ((type-id (parquet-struct-field-type-id field))
    1.70-           (type (parquet-struct-field-type field))
    1.71-           (required (parquet-struct-field-required field))
    1.72-           (unit-type (or (when type-id (parse-type-id type-id)) (when type (parse-type type)))))
    1.73-      (if (and (equal "optional" required) (not (equal unit-type 'list))) ;; (listp nil) = t
    1.74-          `(or null ,unit-type)
    1.75-          unit-type))))
    1.76+    (let* ((type-id (parse-type-id (parquet-struct-field-type-id field)))
    1.77+           (type (parse-type (parquet-struct-field-type field)))
    1.78+           (required (parquet-struct-field-required field)))
    1.79+          (let ((ret (cond
    1.80+                       ((eql 'vector type-id) `(vector ,type))
    1.81+                       (t (or type type-id)))))
    1.82+            (if (equal "optional" required)
    1.83+                `(or null ,ret)
    1.84+                ret)))))
    1.85 
    1.86 (defun parquet-json-enums ()
    1.87   (list
    1.88@@ -130,7 +148,6 @@
    1.89   (setq *parquet-structs* (parquet-json-structs)))
    1.90 
    1.91 ;;; CLOS
    1.92-(defclass parquet-object () ())
    1.93 
    1.94 ;; (defmethod print-object ((obj parquet-object) stream)
    1.95 ;;   "Output a Parquet object to a stream."
    1.96@@ -138,7 +155,7 @@
    1.97 
    1.98 (defmacro define-parquet-class (name superclasses slots &rest options)
    1.99   "Define a new subclass of PARQUET-OBJECT with NAME."
   1.100-  `(defclass ,name ,@(if-let ((s superclasses)) (list s) `((parquet-object))) ,slots ,@options))
   1.101+  `(defclass ,name ,@(if-let ((s superclasses)) (list s) `((dat/parquet::parquet-object))) ,slots ,@options))
   1.102 
   1.103 ;;; Codegen
   1.104 
   1.105@@ -148,16 +165,18 @@
   1.106     "Define all known values in *PARQUET-STRUCTS* using DEFINE-PARQUET-CLASS (DEFCLASS)."
   1.107     (loop for struct in *parquet-structs*
   1.108           unless (null struct)
   1.109-          collect (let ((name (parquet-struct-name struct))
   1.110-                        (doc (parquet-struct-doc struct))
   1.111-                        (fields (parquet-struct-fields struct)))
   1.112-                    `(define-parquet-class ,(intern (cond
   1.113-                                                      ((equal name "UUIDType") "PARQUET-UUID-TYPE")
   1.114-                                                      (t (concatenate 'string
   1.115-                                                                      "PARQUET-"
   1.116-                                                                      (camelcase-name-to-lisp-name name))))
   1.117-                                                    :dat/parquet)
   1.118-                         (parquet-struct-object)
   1.119+          collect (let* ((name (parquet-struct-name struct))
   1.120+                         (doc (parquet-struct-doc struct))
   1.121+                         (fields (parquet-struct-fields struct))
   1.122+                         (class-name (intern (cond
   1.123+                                               ((equal name "UUIDType") "PARQUET-UUID-TYPE")
   1.124+                                               (t (concatenate 'string
   1.125+                                                               "PARQUET-"
   1.126+                                                               (camelcase-name-to-lisp-name name))))
   1.127+                                             :dat/parquet)))
   1.128+                    `(progn
   1.129+                       (define-parquet-class ,class-name
   1.130+                         (dat/parquet::parquet-struct-object)
   1.131                        (,@(mapcar (lambda (f)
   1.132                                     (let ((fdoc (parquet-struct-field-doc f))
   1.133                                           (fname (snakecase-name-to-lisp-name
   1.134@@ -171,7 +190,8 @@
   1.135                                         ,@(when-let ((ty (convert-parquet-struct-field-type f)))
   1.136                                             `(:type ,ty)))))
   1.137                                   fields))
   1.138-                       ,@(when doc `((:documentation ,doc))))))))
   1.139+                       ,@(when doc `((:documentation ,doc))))
   1.140+                       ',class-name)))))
   1.141 
   1.142 (defmacro define-parquet-structs ()
   1.143   `(list
   1.144@@ -196,8 +216,7 @@
   1.145 (defun load-parquet (&key (file *parquet-json-file*))
   1.146   (init-parquet-json file)
   1.147   (with-package (:dat/parquet)
   1.148-    (define-parquet-class parquet-struct-object () ())
   1.149     (let ((types (define-parquet-types)))
   1.150       (export types))
   1.151-    (export (mapcar 'class-name (define-parquet-structs)))
   1.152+    (export (define-parquet-structs))
   1.153     (export *parquet-enums*)))
     2.1--- a/lisp/lib/dat/parquet/obj.lisp	Mon Jul 15 21:59:45 2024 -0400
     2.2+++ b/lisp/lib/dat/parquet/obj.lisp	Tue Jul 16 21:52:09 2024 -0400
     2.3@@ -5,6 +5,10 @@
     2.4 ;;; Code:
     2.5 (in-package :dat/parquet)
     2.6 
     2.7+(defclass parquet-object () ())
     2.8+
     2.9+(defclass parquet-struct-object (parquet-object) ())
    2.10+
    2.11 (eval-always
    2.12   (dat/parquet/gen::load-parquet))
    2.13 
    2.14@@ -22,3 +26,4 @@
    2.15   (def-parquet-type parquet-type)
    2.16   (def-parquet-type parquet-converted-type)
    2.17   (def-parquet-type parquet-page-type))
    2.18+
     3.1--- a/lisp/lib/dat/parquet/tcompact.lisp	Mon Jul 15 21:59:45 2024 -0400
     3.2+++ b/lisp/lib/dat/parquet/tcompact.lisp	Tue Jul 16 21:52:09 2024 -0400
     3.3@@ -15,8 +15,12 @@
     3.4 
     3.5 ;;; Protocol
     3.6 
     3.7-(defgeneric tcompact-field-id (self))
     3.8-(defgeneric tcompact-element-id (self))
     3.9+(defclass thrift-object (id) ())
    3.10+
    3.11+(defgeneric thrift-element-type (self)
    3.12+  (:method ((self parquet-struct-object)) :struct))
    3.13+
    3.14+(defgeneric thrift-object-length (self))
    3.15 
    3.16 ;;; Integers
    3.17 
    3.18@@ -35,7 +39,7 @@
    3.19   (logxor (ash n 1) (ash n -63)))
    3.20 
    3.21 (defun zagzig (n)
    3.22-  (declare (fixnum n))
    3.23+  (declare (integer n))
    3.24   (logxor (ash n -1) (- (logand n 1))))
    3.25 
    3.26 (defun tcompact-encode-integer (n &optional (size 8))
    3.27@@ -44,8 +48,6 @@
    3.28       (vector n)
    3.29       (encode-uleb128 (zigzag n) size)))
    3.30 
    3.31-;; (zagzig (zigzag -3)) ;=-3
    3.32-
    3.33 ;;; Enums
    3.34 
    3.35 ;; ordinal value encoded as int32
    3.36@@ -73,6 +75,15 @@
    3.37 (defun tcompact-encode-string (string)
    3.38   (sb-ext:string-to-octets string :external-format :utf-8))
    3.39 
    3.40+;;; Double
    3.41+(defun tcompact-encode-double (float)
    3.42+  (tcompact-encode-integer (encode-float32 float)))
    3.43+
    3.44+;;; Boolean
    3.45+
    3.46+(defun tcompact-encode-boolean (bool)
    3.47+  (if bool 1 0))
    3.48+
    3.49 ;;; UUID
    3.50 
    3.51 ;; always 16 bytes, no length header
    3.52@@ -120,7 +131,8 @@
    3.53 
    3.54 (defvar *tcompact-field-types*
    3.55   #(:true :false :i8 :i16 :i32 :i64 :double :binary :list :set :map :struct :uuid))
    3.56-(defun tcompact-field-type-id (n) (1+ (aref *tcompact-field-types* n)))
    3.57+(defun tcompact-field-type-id* (n) (1+ (aref *tcompact-field-types* n)))
    3.58+(defun tcompact-field-type-id (k) (1+ (position k *tcompact-field-types*)))
    3.59 
    3.60 ;; (ldb (byte 4 0) n)
    3.61 (defun tcompact-encode-field-header-short (id-delta type-id)
    3.62@@ -131,13 +143,17 @@
    3.63   (tcompact-encode-integer id))
    3.64 
    3.65 (defun tcompact-encode-field-header (field)
    3.66-  (concatenate 'octet-vector
    3.67-               (tcompact-encode-field-header-short 0 (tcompact-field-type-id field))
    3.68-               (tcompact-encode-field-id (tcompact-field-id field))))
    3.69+  (let ((ret (make-array 5 :element-type '(unsigned-byte 8) :fill-pointer 0)))
    3.70+    (vector-push (tcompact-encode-field-header-short 0 (tcompact-field-type-id* field))
    3.71+                 ret)
    3.72+    (loop for x across (tcompact-encode-field-id (id field))
    3.73+          do (vector-push x ret)
    3.74+          finally (return ret))))
    3.75 
    3.76 (defun tcompact-encode-field-value (field))
    3.77 
    3.78 (defun tcompact-encode-struct (struct))
    3.79+
    3.80   ;; field-id-delta = current-field-id - previous-field-id
    3.81 
    3.82 ;;; List and Set
    3.83@@ -155,13 +171,26 @@
    3.84 |#
    3.85 
    3.86 (deftype tcompact-element-type-id () '(unsigned-byte 4))
    3.87+;; tcompact short size = [0,14]
    3.88 
    3.89 (defvar *tcompact-element-types*
    3.90   #(:bool :i8 :i16 :i32 :i64 :double :binary :list :set :map :struct :uuid))
    3.91 
    3.92-(defun tcompact-element-type-id (n) (+ (aref *tcompact-element-types* n) 2))
    3.93+(defun tcompact-element-type-id* (n) (+ (aref *tcompact-element-types* n) 2))
    3.94+(defun tcompact-element-type-id (k) (+ (position k *tcompact-element-types*) 2))
    3.95+
    3.96+(defun tcompact-encode-list-header-short (size elt-type)
    3.97+  (dpb elt-type (byte 4 4)
    3.98+       (dpb size (byte 4 0) 0)))
    3.99 
   3.100-(defun tcompact-encode-list ())
   3.101+(defun tcompact-encode-list-header (list)
   3.102+  (let ((ret (make-array 5 :element-type '(unsigned-byte 8) :fill-pointer 0)))
   3.103+    (vector-push (tcompact-encode-list-header-short #xf (id list)) ret)
   3.104+    (loop for x across (tcompact-encode-integer (thrift-object-length list) 4)
   3.105+          do (vector-push x ret)
   3.106+          finally (return ret))))
   3.107+
   3.108+(defun tcompact-encode-list-element (type value))
   3.109 
   3.110 ;;; Map
   3.111 
     4.1--- a/lisp/lib/dat/tests.lisp	Mon Jul 15 21:59:45 2024 -0400
     4.2+++ b/lisp/lib/dat/tests.lisp	Tue Jul 16 21:52:09 2024 -0400
     4.3@@ -1,5 +1,5 @@
     4.4 (defpackage :dat/tests
     4.5-  (:use :cl :std :rt :dat))
     4.6+  (:use :cl :std :rt :dat :log))
     4.7 
     4.8 (in-package :dat/tests)
     4.9 
    4.10@@ -148,7 +148,7 @@
    4.11 
    4.12 (defparameter *parquet-test-file*
    4.13   (probe-file
    4.14-   (merge-pathnames "../../../.stash/datapage_v1-uncompressed-checksum.parquet"
    4.15+   (merge-pathnames "../../../.stash/alltypes_plain.parquet"
    4.16                     #.(asdf:system-source-directory :dat/tests))))
    4.17 ;; see also: https://github.com/apache/parquet-testing/blob/master/data/README.md
    4.18 (deftest parquet-basic ()
    4.19@@ -157,6 +157,6 @@
    4.20       (let ((footer (dat/parquet::parquet-read-footer st)))
    4.21         (is (typep footer
    4.22                    'dat/parquet::parquet-file-meta-data))
    4.23-        (print (slot-value footer 'dat/parquet::schema))
    4.24-        (print (file-position st))
    4.25-        (print (file-length st))))))
    4.26+        (trace! (slot-value footer 'dat/parquet::schema))
    4.27+        (trace! (file-position st))
    4.28+        (trace! (file-length st))))))
     5.1--- a/lisp/std/num/leb128.lisp	Mon Jul 15 21:59:45 2024 -0400
     5.2+++ b/lisp/std/num/leb128.lisp	Tue Jul 16 21:52:09 2024 -0400
     5.3@@ -67,10 +67,10 @@
     5.4                (return-from decode-leb128 (values (logior result (the fixnum (ash (lognot 0) shift))) counter))
     5.5                (return-from decode-leb128 (values result counter)))))))
     5.6 
     5.7-(declaim (ftype (function (fixnum &optional (unsigned-byte 8)) (simple-array (unsigned-byte 8))) encode-uleb128))
     5.8+(declaim (ftype (function (integer &optional (unsigned-byte 8)) (array (unsigned-byte 8))) encode-uleb128))
     5.9 (defun encode-uleb128 (int &optional size)
    5.10   "Encode an integer INT as a ULEB128 byte array with SIZE (in bytes)."
    5.11-  (declare (fixnum int))
    5.12+  (declare (integer int))
    5.13   (let ((more t) (curr) (in 0) (ret (make-array
    5.14                                      (if size
    5.15                                          size
    5.16@@ -88,15 +88,15 @@
    5.17          (incf in))
    5.18     ret))
    5.19 
    5.20-(declaim (ftype (function ((vector unsigned-byte) &optional fixnum) fixnum) decode-uleb128))
    5.21+(declaim (ftype (function ((vector unsigned-byte) &optional t) integer) decode-uleb128))
    5.22 (defun decode-uleb128 (bits &optional (start 0))
    5.23   "Decode an unsigned integer from ULEB128 byte array."
    5.24   (let ((result 0) (shift 0) (curr) (counter 0))
    5.25-    (declare (fixnum result shift counter start))
    5.26+    (declare (fixnum shift counter))
    5.27     (loop do 
    5.28          (setf curr (aref bits start))
    5.29          (setf start (+ 1 start))
    5.30-         (setf result (logior result (the fixnum (ash (logand curr #x7f) shift))))
    5.31+         (setf result (logior result (ash (logand curr #x7f) shift)))
    5.32          (setf shift (+ 7 shift))
    5.33          (incf counter)
    5.34          (when (= 0 (logand curr #x80))
    5.35@@ -110,10 +110,10 @@
    5.36   (when (not (= start 0))
    5.37     (loop for i from 0 upto start do (read-byte s)))
    5.38   (let ((result 0) (shift 0) (curr) (counter 0))
    5.39-    (declare (fixnum result shift counter))
    5.40+    (declare (fixnum shift counter))
    5.41     (loop do 
    5.42          (setf curr (read-byte s))
    5.43-         (setf result (logior result (the fixnum (ash (logand curr #x7f) shift))))
    5.44+         (setf result (logior result (ash (logand curr #x7f) shift)))
    5.45          (setf shift (+ 7 shift))
    5.46          (incf counter)
    5.47          (when (= 0 (logand curr #x80))
     6.1--- a/skelfile	Mon Jul 15 21:59:45 2024 -0400
     6.2+++ b/skelfile	Tue Jul 16 21:52:09 2024 -0400
     6.3@@ -39,6 +39,8 @@
     6.4  (parquet.json ()
     6.5                (net/fetch:download "https://packy.compiler.company/data/parquet.json"
     6.6                                    :output ".stash/parquet.json"))
     6.7+ (parquet-test-data () (net/fetch:download "https://packy.compiler.company/data/test/alltypes_plain.parquet"
     6.8+                                           :output ".stash/alltypes_plain.parquet"))
     6.9  ;; lisp
    6.10  (%stash () #$mkdir -pv .stash$#)
    6.11  (rdb (%stash) (with-sbcl (:noinform t :quit t)