changelog shortlog graph tags branches files raw help

Mercurial > core / changeset: bit of parquet refactoring, properly generate slot types

changeset 541: 10c4bb778030
parent 540: bd49b7e2c623
child 542: d83b4d25d5c9
author: Richard Westhaver <ellis@rwest.io>
date: Fri, 12 Jul 2024 22:33:57 -0400
files: lisp/lib/dat/dat.asd lisp/lib/dat/parquet/gen.lisp lisp/lib/dat/parquet/parquet.lisp lisp/lib/dat/parquet/pkg.lisp lisp/lib/dat/pkg.lisp lisp/lib/dat/tests.lisp
description: bit of parquet refactoring, properly generate slot types
     1.1--- a/lisp/lib/dat/dat.asd	Fri Jul 12 19:57:18 2024 -0400
     1.2+++ b/lisp/lib/dat/dat.asd	Fri Jul 12 22:33:57 2024 -0400
     1.3@@ -11,7 +11,8 @@
     1.4                (:file "json")
     1.5                (:module "parquet"
     1.6                 :components
     1.7-                ((:file "gen")
     1.8+                ((:file "pkg")
     1.9+                 (:file "gen")
    1.10                  (:file "parquet")))
    1.11                (:module "xml"
    1.12                 :components
     2.1--- a/lisp/lib/dat/parquet/gen.lisp	Fri Jul 12 19:57:18 2024 -0400
     2.2+++ b/lisp/lib/dat/parquet/gen.lisp	Fri Jul 12 22:33:57 2024 -0400
     2.3@@ -23,12 +23,12 @@
     2.4 
     2.5 (defmacro def-parquet-enum (sym name)
     2.6   `(progn
     2.7-     (defun ,(symbolicate 'parquet-json- sym) ()
     2.8+     (defun ,(symbolicate "PARQUET-JSON-" sym) ()
     2.9        (mapcar (lambda (x) (json-getf x "name")) (parquet-json-enum-getf ,name)))
    2.10      (defparameter ,(intern
    2.11                      (concatenate 'string "*PARQUET-" (symbol-name sym) "*")
    2.12                      :dat/parquet)
    2.13-       (,(symbolicate 'parquet-json- sym)))))
    2.14+       (,(symbolicate "PARQUET-JSON-" sym)))))
    2.15 
    2.16 (defun camelcase-name-to-lisp-name (string)
    2.17   (string-upcase
    2.18@@ -43,6 +43,37 @@
    2.19   (string-upcase
    2.20    (substitute #\- #\_ string)))
    2.21 
    2.22+(labels ((parse-type-id (type-id)
    2.23+           (string-case (type-id :default nil)
    2.24+             ("bool" 'boolean)
    2.25+             ("byte" 'signed-byte)
    2.26+             ("i16" '(signed-byte 16))
    2.27+             ("i32" '(signed-byte 32))
    2.28+             ("i64" '(signed-byte 64))
    2.29+             ("double" 'double-float)
    2.30+             ("string" 'string)
    2.31+             ("list" 'list)
    2.32+             ("binary" 'octet-vector)
    2.33+             ("set" 'list)))
    2.34+         (parse-type (o)
    2.35+           (intern
    2.36+            (concatenate 'string
    2.37+                         "PARQUET-"
    2.38+                         (camelcase-name-to-lisp-name
    2.39+                          (string-case ((json-getf o "typeId"))
    2.40+                            ("union" (json-getf o "class"))
    2.41+                            ("struct" (json-getf o "class"))
    2.42+                            ("enum" (json-getf o "class")))))
    2.43+            :dat/parquet)))
    2.44+  (defun convert-parquet-struct-field-type (field) ;; technically part of thrift type system
    2.45+    (let* ((type-id (parquet-struct-field-type-id field))
    2.46+           (type (parquet-struct-field-type field))
    2.47+           (required (parquet-struct-field-required field))
    2.48+           (unit-type (or (when type-id (parse-type-id type-id)) (when type (parse-type type)))))
    2.49+      (if (and (equal "optional" required) (not (equal unit-type 'list))) ;; (listp nil) = t
    2.50+          `(or null ,unit-type)
    2.51+          unit-type))))
    2.52+
    2.53 (defun parquet-json-enums ()
    2.54   (list
    2.55    (def-parquet-enum types "Type")
    2.56@@ -77,14 +108,7 @@
    2.57         (parquet-struct-exceptionp struct)
    2.58         (mapcar #'parquet-destruct-field (parquet-struct-fields struct))))
    2.59 
    2.60-(flet ((pq-type-parse (o) (let ((id (json-getf o "typeId")))
    2.61-                            (string-case (id :default (warn 'simple-warning :format-control "unknown typeId: ~A"
    2.62-                                                                            :format-arguments (list id)))
    2.63-                              ("list" (cons id (json-getf o "elemTypeId")))
    2.64-                              ("union" (cons id (json-getf o "class")))
    2.65-                              ("struct" (cons id (json-getf o "class")))
    2.66-                              ("enum" (cons id (json-getf o "class")))))))
    2.67-  (defun parquet-json-structs () ;; name doc isException isUnion fields
    2.68+(defun parquet-json-structs () ;; name doc isException isUnion fields
    2.69   (mapcar
    2.70    (lambda (s)
    2.71      (let ((name (json-getf s "name"))
    2.72@@ -97,13 +121,12 @@
    2.73                                   (name (json-getf f "name"))
    2.74                                   (type-id (json-getf f "typeId"))
    2.75                                   ;; json object - needs additional parsing
    2.76-                                  (type (when-let ((ty (json-getf f "type")))
    2.77-                                          (pq-type-parse ty)))
    2.78+                                  (type (json-getf f "type"))
    2.79                                   (doc (json-getf f "doc"))
    2.80                                   (required (json-getf f "required")))
    2.81                               (make-parquet-struct-field key name type-id type doc required)))))
    2.82        (make-parquet-struct name doc exceptionp unionp fields)))
    2.83-   (json-getf *parquet-json* "structs"))))
    2.84+   (json-getf *parquet-json* "structs")))
    2.85 
    2.86 (defun parquet-json-namespaces ()
    2.87   (json-getf *parquet-json* "namespaces"))
    2.88@@ -124,40 +147,42 @@
    2.89   "Define a new subclass of PARQUET-OBJECT with NAME."
    2.90   `(defclass ,name ,@(if-let ((s superclasses)) (list s) `((parquet-object))) ,slots ,@options))
    2.91 
    2.92-(define-parquet-class dat/parquet:parquet-enum-object () ())
    2.93-(define-parquet-class dat/parquet:parquet-struct-object () ())
    2.94-
    2.95 ;;; Codegen
    2.96 
    2.97 ;; 8)
    2.98-(defun %define-parquet-structs ()
    2.99-  "Define all known values in *PARQUET-STRUCTS* using DEFINE-PARQUET-CLASS (DEFCLASS)."
   2.100-      (loop for struct in *parquet-structs*
   2.101-            unless (null struct)
   2.102-            collect (let ((name (parquet-struct-name struct))
   2.103-                          (doc (parquet-struct-doc struct))
   2.104-                          (fields (parquet-struct-fields struct)))
   2.105-                      `(define-parquet-class ,(intern (concatenate 'string
   2.106-                                                                   "PARQUET-"
   2.107-                                                                   (camelcase-name-to-lisp-name name))
   2.108-                                                      :dat/parquet)
   2.109-                           (parquet-struct-object)
   2.110-                         (,@(mapcar (lambda (f)
   2.111-                                      (let ((fdoc (parquet-struct-field-doc f))
   2.112-                                            (fname (snakecase-name-to-lisp-name
   2.113-                                                    (parquet-struct-field-name f))))
   2.114-                                        `(,(symbolicate fname)
   2.115-                                          ,@(when fdoc `(:documentation ,fdoc))
   2.116-                                          :initarg ,(keywordicate fname)
   2.117-                                          ;; TODO 2024-07-12: 
   2.118-                                          ,@(when (equal "optional" (parquet-struct-field-required f))
   2.119-                                              `(:initform nil)))))
   2.120-                                    fields))
   2.121-                         ,@(when doc `((:documentation ,doc)))))))
   2.122+(eval-always
   2.123+  (defun %define-parquet-structs ()
   2.124+    "Define all known values in *PARQUET-STRUCTS* using DEFINE-PARQUET-CLASS (DEFCLASS)."
   2.125+    (loop for struct in *parquet-structs*
   2.126+          unless (null struct)
   2.127+          collect (let ((name (parquet-struct-name struct))
   2.128+                        (doc (parquet-struct-doc struct))
   2.129+                        (fields (parquet-struct-fields struct)))
   2.130+                    `(define-parquet-class ,(intern (cond
   2.131+                                                      ((equal name "UUIDType") "PARQUET-UUID-TYPE")
   2.132+                                                      (t (concatenate 'string
   2.133+                                                                      "PARQUET-"
   2.134+                                                                      (camelcase-name-to-lisp-name name))))
   2.135+                                                    :dat/parquet)
   2.136+                         (parquet-struct-object)
   2.137+                       (,@(mapcar (lambda (f)
   2.138+                                    (let ((fdoc (parquet-struct-field-doc f))
   2.139+                                          (fname (snakecase-name-to-lisp-name
   2.140+                                                  (parquet-struct-field-name f))))
   2.141+                                      `(,(intern fname :dat/parquet)
   2.142+                                        ,@(when fdoc `(:documentation ,fdoc))
   2.143+                                        :initarg ,(keywordicate fname)
   2.144+                                        ;; TODO 2024-07-12: 
   2.145+                                        ,@(when (equal "optional" (parquet-struct-field-required f))
   2.146+                                            `(:initform nil))
   2.147+                                        ,@(when-let ((ty (convert-parquet-struct-field-type f)))
   2.148+                                            `(:type ,ty)))))
   2.149+                                  fields))
   2.150+                       ,@(when doc `((:documentation ,doc))))))))
   2.151 
   2.152 (defmacro define-parquet-structs ()
   2.153   `(list
   2.154-     ,@(%define-parquet-structs)))
   2.155+    ,@(%define-parquet-structs)))
   2.156 
   2.157 (defmacro define-parquet-type (name opts &body body)
   2.158   "Define a parquet type with DEFTYPE which maps to LISP-TYPE."
   2.159@@ -178,6 +203,8 @@
   2.160 (defun load-parquet (&key (file *parquet-json-file*))
   2.161   (init-parquet-json file)
   2.162   (with-package (:dat/parquet)
   2.163+    (define-parquet-class parquet-enum-object () ())
   2.164+    (define-parquet-class parquet-struct-object () ())
   2.165     (export (define-parquet-types))
   2.166     (export (mapcar 'class-name (define-parquet-structs)))
   2.167     (export *parquet-enums*)))
     3.1--- a/lisp/lib/dat/parquet/parquet.lisp	Fri Jul 12 19:57:18 2024 -0400
     3.2+++ b/lisp/lib/dat/parquet/parquet.lisp	Fri Jul 12 22:33:57 2024 -0400
     3.3@@ -10,6 +10,28 @@
     3.4 https://github.com/apache/parquet-testing
     3.5 https://github.com/apache/parquet-java
     3.6 https://github.com/apache/arrow-rs
     3.7+
     3.8+https://thrift.apache.org/docs/types
     3.9+|#
    3.10+
    3.11+#|
    3.12+    4-byte magic number "PAR1"
    3.13+    <Column 1 Chunk 1>
    3.14+    <Column 2 Chunk 1>
    3.15+    ...
    3.16+    <Column N Chunk 1>
    3.17+    <Column 1 Chunk 2>
    3.18+    <Column 2 Chunk 2>
    3.19+    ...
    3.20+    <Column N Chunk 2>
    3.21+    ...
    3.22+    <Column 1 Chunk M>
    3.23+    <Column 2 Chunk M>
    3.24+    ...
    3.25+    <Column N Chunk M>
    3.26+    File Metadata
    3.27+    4-byte length in bytes of file metadata (little endian)
    3.28+    4-byte magic number "PAR1"
    3.29 |#
    3.30 
    3.31 ;; In this package we're being as lazy as possible. To generate our own
    3.32@@ -21,12 +43,13 @@
    3.33 ;; 
    3.34 ;;; Code:
    3.35 (in-package :dat/parquet)
    3.36-(eval-when (:compile-toplevel)
    3.37-  (load-parquet))
    3.38+(eval-always
    3.39+  (dat/parquet/gen::load-parquet))
    3.40 
    3.41 (defgeneric parquet-read (value &optional stream))
    3.42 (defgeneric parquet-write (value &optional stream))
    3.43 
    3.44+;;  HACK 2024-07-12: 
    3.45 (define-bitfield parquet-compression-codec
    3.46   (uncompressed boolean)
    3.47   (snappy boolean)
    3.48@@ -38,6 +61,21 @@
    3.49   (lz4-raw boolean))
    3.50 
    3.51 ;;; Read/Write
    3.52+(define-constant +parquet-magic-number+ "PAR1" :test 'equal)
    3.53+
    3.54+(defconstant +default-parquet-page-size+ (* 8 1024)) ;; 8kb
    3.55+(defconstant +default-parquet-row-group-size (expt 1024 3)) ;; 1gb
    3.56+
    3.57+(defun parquet-write-magic (stream)
    3.58+  (write-string +parquet-magic-number+ stream))
    3.59+
    3.60+(defun parquet-read-magic (stream)
    3.61+  (assert (char= #.(char +parquet-magic-number+ 0) (read-char stream)))
    3.62+  (assert (char= #.(char +parquet-magic-number+ 1) (read-char stream)))
    3.63+  (assert (char= #.(char +parquet-magic-number+ 2) (read-char stream)))
    3.64+  (assert (char= #.(char +parquet-magic-number+ 3) (read-char stream)))
    3.65+  t)
    3.66+
    3.67 (defmethod parquet-write ((value (eql t)) &optional stream)
    3.68   "Encode a parquet boolean true value."
    3.69   (declare (ignore value))
     4.1--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     4.2+++ b/lisp/lib/dat/parquet/pkg.lisp	Fri Jul 12 22:33:57 2024 -0400
     4.3@@ -0,0 +1,10 @@
     4.4+;;; pkg.lisp --- Parquet Packages
     4.5+
     4.6+;; 
     4.7+
     4.8+;;; Code:
     4.9+(in-package :dat/parquet)
    4.10+
    4.11+(defpackage :dat/parquet/gen
    4.12+  (:use :cl :std :dat/proto :dat/json)
    4.13+  (:export :load-parquet))
     5.1--- a/lisp/lib/dat/pkg.lisp	Fri Jul 12 19:57:18 2024 -0400
     5.2+++ b/lisp/lib/dat/pkg.lisp	Fri Jul 12 22:33:57 2024 -0400
     5.3@@ -233,12 +233,8 @@
     5.4   (:use :cl :std :dat/proto)
     5.5   (:export))
     5.6 
     5.7-(defpackage :dat/parquet/gen
     5.8-  (:use :cl :std :dat/proto :dat/json))
     5.9-
    5.10 (defpackage :dat/parquet
    5.11   (:use :cl :std :dat/proto :dat/json)
    5.12-  (:import-from :dat/parquet/gen :load-parquet)
    5.13   (:export
    5.14    :parquet-object
    5.15    :parquet-enum-object
     6.1--- a/lisp/lib/dat/tests.lisp	Fri Jul 12 19:57:18 2024 -0400
     6.2+++ b/lisp/lib/dat/tests.lisp	Fri Jul 12 22:33:57 2024 -0400
     6.3@@ -145,3 +145,11 @@
     6.4       (read-sxp-stream f s))
     6.5     (with-output-to-string (s)
     6.6       (is (write-sxp-stream f s)))))
     6.7+
     6.8+(deftest parquet-basic ()
     6.9+  (is
    6.10+   (with-input-from-string
    6.11+       (s
    6.12+        (with-output-to-string (s)
    6.13+          (dat/parquet::parquet-write-magic s)))
    6.14+     (dat/parquet::parquet-read-magic s))))