# HG changeset patch
# User Richard Westhaver
# Date 1720838037 14400
# Node ID 10c4bb778030c0695ff0e8d3031f55ceca4617ad
# Parent  bd49b7e2c6235e71d6c01b2fd5e6233239c81a7b
bit of parquet refactoring, properly generate slot types

diff -r bd49b7e2c623 -r 10c4bb778030 lisp/lib/dat/dat.asd
--- a/lisp/lib/dat/dat.asd	Fri Jul 12 19:57:18 2024 -0400
+++ b/lisp/lib/dat/dat.asd	Fri Jul 12 22:33:57 2024 -0400
@@ -11,7 +11,8 @@
                (:file "json")
                (:module "parquet"
                 :components
-                ((:file "gen")
+                ((:file "pkg")
+                 (:file "gen")
                  (:file "parquet")))
                (:module "xml"
                 :components
diff -r bd49b7e2c623 -r 10c4bb778030 lisp/lib/dat/parquet/gen.lisp
--- a/lisp/lib/dat/parquet/gen.lisp	Fri Jul 12 19:57:18 2024 -0400
+++ b/lisp/lib/dat/parquet/gen.lisp	Fri Jul 12 22:33:57 2024 -0400
@@ -23,12 +23,12 @@
 
 (defmacro def-parquet-enum (sym name)
   `(progn
-     (defun ,(symbolicate 'parquet-json- sym) ()
+     (defun ,(symbolicate "PARQUET-JSON-" sym) ()
        (mapcar (lambda (x) (json-getf x "name")) (parquet-json-enum-getf ,name)))
      (defparameter ,(intern
                      (concatenate 'string "*PARQUET-" (symbol-name sym) "*")
                      :dat/parquet)
-       (,(symbolicate 'parquet-json- sym)))))
+       (,(symbolicate "PARQUET-JSON-" sym)))))
 
 (defun camelcase-name-to-lisp-name (string)
   (string-upcase
@@ -43,6 +43,37 @@
   (string-upcase
    (substitute #\- #\_ string)))
 
+(labels ((parse-type-id (type-id)
+           (string-case (type-id :default nil)
+             ("bool" 'boolean)
+             ("byte" '(signed-byte 8))
+             ("i16" '(signed-byte 16))
+             ("i32" '(signed-byte 32))
+             ("i64" '(signed-byte 64))
+             ("double" 'double-float)
+             ("string" 'string)
+             ("list" 'list)
+             ("binary" 'octet-vector)
+             ("set" 'list)))
+         (parse-type (o)
+           (intern
+            (concatenate 'string "PARQUET-"
+                         (camelcase-name-to-lisp-name
+                          (string-case ((json-getf o "typeId"))
+                            ("union" (json-getf o "class"))
+                            ("struct" (json-getf o "class"))
+                            ("enum" (json-getf o "class")))))
+            :dat/parquet)))
+  (defun convert-parquet-struct-field-type (field) ;; technically part of thrift type system
+    "Return a Lisp type specifier for FIELD, or NIL when none can be derived; optional non-list types are wrapped in (OR NULL ...)."
+    (let* ((type-id (parquet-struct-field-type-id field))
+           (type (parquet-struct-field-type field))
+           (required (parquet-struct-field-required field))
+           (unit-type (or (when type-id (parse-type-id type-id)) (when type (parse-type type)))))
+      (if (and unit-type (equal "optional" required) (not (equal unit-type 'list))) ;; (listp nil) = t
+          `(or null ,unit-type)
+          unit-type))))
+
 (defun parquet-json-enums ()
   (list
    (def-parquet-enum types "Type")
@@ -77,14 +108,7 @@
         (parquet-struct-exceptionp struct)
         (mapcar #'parquet-destruct-field (parquet-struct-fields struct))))
 
-(flet ((pq-type-parse (o) (let ((id (json-getf o "typeId")))
-                            (string-case (id :default (warn 'simple-warning :format-control "unknown typeId: ~A"
-                                                                            :format-arguments (list id)))
-                              ("list" (cons id (json-getf o "elemTypeId")))
-                              ("union" (cons id (json-getf o "class")))
-                              ("struct" (cons id (json-getf o "class")))
-                              ("enum" (cons id (json-getf o "class")))))))
-  (defun parquet-json-structs () ;; name doc isException isUnion fields
+(defun parquet-json-structs () ;; name doc isException isUnion fields
     (mapcar
      (lambda (s)
        (let ((name (json-getf s "name"))
@@ -97,13 +121,12 @@
                                (name (json-getf f "name"))
                                (type-id (json-getf f "typeId"))
                                ;; json object - needs additional parsing
-                               (type (when-let ((ty (json-getf f "type")))
-                                       (pq-type-parse ty)))
+                               (type (json-getf f "type"))
                                (doc (json-getf f "doc"))
                                (required (json-getf f "required")))
                            (make-parquet-struct-field key name type-id type doc required)))))
          (make-parquet-struct name doc exceptionp unionp fields)))
-     (json-getf *parquet-json* "structs"))))
+     (json-getf *parquet-json* "structs")))
 
 (defun parquet-json-namespaces ()
   (json-getf *parquet-json* "namespaces"))
@@ -124,40 +147,42 @@
   "Define a new subclass of PARQUET-OBJECT with NAME."
   `(defclass ,name ,@(if-let ((s superclasses)) (list s) `((parquet-object))) ,slots ,@options))
 
-(define-parquet-class dat/parquet:parquet-enum-object () ())
-(define-parquet-class dat/parquet:parquet-struct-object () ())
-
 ;;; Codegen
 ;; 8)
 
-(defun %define-parquet-structs ()
-  "Define all known values in *PARQUET-STRUCTS* using DEFINE-PARQUET-CLASS (DEFCLASS)."
-  (loop for struct in *parquet-structs*
-        unless (null struct)
-        collect (let ((name (parquet-struct-name struct))
-                      (doc (parquet-struct-doc struct))
-                      (fields (parquet-struct-fields struct)))
-                  `(define-parquet-class ,(intern (concatenate 'string
-                                                               "PARQUET-"
-                                                               (camelcase-name-to-lisp-name name))
-                                                  :dat/parquet)
-                       (parquet-struct-object)
-                     (,@(mapcar (lambda (f)
-                                  (let ((fdoc (parquet-struct-field-doc f))
-                                        (fname (snakecase-name-to-lisp-name
-                                                (parquet-struct-field-name f))))
-                                    `(,(symbolicate fname)
-                                      ,@(when fdoc `(:documentation ,fdoc))
-                                      :initarg ,(keywordicate fname)
-                                      ;; TODO 2024-07-12:
-                                      ,@(when (equal "optional" (parquet-struct-field-required f))
-                                          `(:initform nil)))))
-                                fields))
-                     ,@(when doc `((:documentation ,doc)))))))
+(eval-always
+  (defun %define-parquet-structs ()
+    "Define all known values in *PARQUET-STRUCTS* using DEFINE-PARQUET-CLASS (DEFCLASS)."
+    (loop for struct in *parquet-structs*
+          unless (null struct)
+          collect (let ((name (parquet-struct-name struct))
+                        (doc (parquet-struct-doc struct))
+                        (fields (parquet-struct-fields struct)))
+                    `(define-parquet-class ,(intern (cond
+                                                      ((equal name "UUIDType") "PARQUET-UUID-TYPE")
+                                                      (t (concatenate 'string
+                                                                      "PARQUET-"
+                                                                      (camelcase-name-to-lisp-name name))))
+                                                    :dat/parquet)
+                         (parquet-struct-object)
+                       (,@(mapcar (lambda (f)
+                                    (let ((fdoc (parquet-struct-field-doc f))
+                                          (fname (snakecase-name-to-lisp-name
+                                                  (parquet-struct-field-name f))))
+                                      `(,(intern fname :dat/parquet)
+                                        ,@(when fdoc `(:documentation ,fdoc))
+                                        :initarg ,(keywordicate fname)
+                                        ;; TODO 2024-07-12:
+                                        ,@(when (equal "optional" (parquet-struct-field-required f))
+                                            `(:initform nil))
+                                        ,@(when-let ((ty (convert-parquet-struct-field-type f)))
+                                            `(:type ,ty)))))
+                                  fields))
+                       ,@(when doc `((:documentation ,doc))))))))
 
 (defmacro define-parquet-structs ()
   `(list
-     ,@(%define-parquet-structs)))
+    ,@(%define-parquet-structs)))
 
 (defmacro define-parquet-type (name opts &body body)
   "Define a parquet type with DEFTYPE which maps to LISP-TYPE."
@@ -178,6 +203,8 @@
 (defun load-parquet (&key (file *parquet-json-file*))
   (init-parquet-json file)
   (with-package (:dat/parquet)
+    (define-parquet-class parquet-enum-object () ())
+    (define-parquet-class parquet-struct-object () ())
     (export (define-parquet-types))
     (export (mapcar 'class-name (define-parquet-structs)))
     (export *parquet-enums*)))
diff -r bd49b7e2c623 -r 10c4bb778030 lisp/lib/dat/parquet/parquet.lisp
--- a/lisp/lib/dat/parquet/parquet.lisp	Fri Jul 12 19:57:18 2024 -0400
+++ b/lisp/lib/dat/parquet/parquet.lisp	Fri Jul 12 22:33:57 2024 -0400
@@ -10,6 +10,28 @@
 https://github.com/apache/parquet-testing
 https://github.com/apache/parquet-java
 https://github.com/apache/arrow-rs
+
+https://thrift.apache.org/docs/types
+|#
+
+#|
+    4-byte magic number "PAR1"
+    <Column 1 Chunk 1>
+    <Column 2 Chunk 1>
+    ...
+    <Column N Chunk 1>
+    <Column 1 Chunk 2>
+    <Column 2 Chunk 2>
+    ...
+    <Column N Chunk 2>
+    ...
+    <Column 1 Chunk M>
+    <Column 2 Chunk M>
+    ...
+    <Column N Chunk M>
+    File Metadata
+    4-byte length in bytes of file metadata (little endian)
+    4-byte magic number "PAR1"
 |#
 
 ;; In this package we're being as lazy as possible. To generate our own
@@ -21,12 +43,13 @@
 ;;
 ;;; Code:
 (in-package :dat/parquet)
-(eval-when (:compile-toplevel)
-  (load-parquet))
+(eval-always
+  (dat/parquet/gen::load-parquet))
 
 (defgeneric parquet-read (value &optional stream))
 (defgeneric parquet-write (value &optional stream))
 
+;; HACK 2024-07-12:
 (define-bitfield parquet-compression-codec
   (uncompressed boolean)
   (snappy boolean)
@@ -38,6 +61,23 @@
   (lz4-raw boolean))
 
 ;;; Read/Write
+(define-constant +parquet-magic-number+ "PAR1" :test 'equal)
+
+(defconstant +default-parquet-page-size+ (* 8 1024)) ;; 8 KiB
+(defconstant +default-parquet-row-group-size+ (expt 1024 3)) ;; 1 GiB
+
+(defun parquet-write-magic (stream)
+  "Write the four characters of +PARQUET-MAGIC-NUMBER+ to STREAM."
+  (write-string +parquet-magic-number+ stream))
+
+(defun parquet-read-magic (stream)
+  "Read four characters from STREAM, asserting each equals the matching character of +PARQUET-MAGIC-NUMBER+. Return T."
+  (assert (char= #.(char +parquet-magic-number+ 0) (read-char stream)))
+  (assert (char= #.(char +parquet-magic-number+ 1) (read-char stream)))
+  (assert (char= #.(char +parquet-magic-number+ 2) (read-char stream)))
+  (assert (char= #.(char +parquet-magic-number+ 3) (read-char stream)))
+  t)
+
 (defmethod parquet-write ((value (eql t)) &optional stream)
   "Encode a parquet boolean true value."
   (declare (ignore value))
diff -r bd49b7e2c623 -r 10c4bb778030 lisp/lib/dat/parquet/pkg.lisp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lisp/lib/dat/parquet/pkg.lisp	Fri Jul 12 22:33:57 2024 -0400
@@ -0,0 +1,10 @@
+;;; pkg.lisp --- Parquet Packages
+
+;;
+
+;;; Code:
+(in-package :dat/parquet)
+
+(defpackage :dat/parquet/gen
+  (:use :cl :std :dat/proto :dat/json)
+  (:export :load-parquet))
diff -r bd49b7e2c623 -r 10c4bb778030 lisp/lib/dat/pkg.lisp
--- a/lisp/lib/dat/pkg.lisp	Fri Jul 12 19:57:18 2024 -0400
+++ b/lisp/lib/dat/pkg.lisp	Fri Jul 12 22:33:57 2024 -0400
@@ -233,12 +233,8 @@
   (:use :cl :std :dat/proto)
   (:export))
 
-(defpackage :dat/parquet/gen
-  (:use :cl :std :dat/proto :dat/json))
-
 (defpackage :dat/parquet
   (:use :cl :std :dat/proto :dat/json)
-  (:import-from :dat/parquet/gen :load-parquet)
   (:export
    :parquet-object
    :parquet-enum-object
diff -r bd49b7e2c623 -r 10c4bb778030 lisp/lib/dat/tests.lisp
--- a/lisp/lib/dat/tests.lisp	Fri Jul 12 19:57:18 2024 -0400
+++ b/lisp/lib/dat/tests.lisp	Fri Jul 12 22:33:57 2024 -0400
@@ -145,3 +145,11 @@
             (read-sxp-stream f s))
           (with-output-to-string (s)
             (is (write-sxp-stream f s)))))
+
+(deftest parquet-basic ()
+  (is
+   (with-input-from-string
+       (s
+        (with-output-to-string (s)
+          (dat/parquet::parquet-write-magic s)))
+     (dat/parquet::parquet-read-magic s))))