1.1--- a/lisp/lib/dat/dat.asd Fri Jul 12 19:57:18 2024 -0400
1.2+++ b/lisp/lib/dat/dat.asd Fri Jul 12 22:33:57 2024 -0400
1.3@@ -11,7 +11,8 @@
1.4 (:file "json")
1.5 (:module "parquet"
1.6 :components
1.7- ((:file "gen")
1.8+ ((:file "pkg")
1.9+ (:file "gen")
1.10 (:file "parquet")))
1.11 (:module "xml"
1.12 :components
2.1--- a/lisp/lib/dat/parquet/gen.lisp Fri Jul 12 19:57:18 2024 -0400
2.2+++ b/lisp/lib/dat/parquet/gen.lisp Fri Jul 12 22:33:57 2024 -0400
2.3@@ -23,12 +23,12 @@
2.4
2.5 (defmacro def-parquet-enum (sym name)
2.6 `(progn
2.7- (defun ,(symbolicate 'parquet-json- sym) ()
2.8+ (defun ,(symbolicate "PARQUET-JSON-" sym) ()
2.9 (mapcar (lambda (x) (json-getf x "name")) (parquet-json-enum-getf ,name)))
2.10 (defparameter ,(intern
2.11 (concatenate 'string "*PARQUET-" (symbol-name sym) "*")
2.12 :dat/parquet)
2.13- (,(symbolicate 'parquet-json- sym)))))
2.14+ (,(symbolicate "PARQUET-JSON-" sym)))))
2.15
2.16 (defun camelcase-name-to-lisp-name (string)
2.17 (string-upcase
2.18@@ -43,6 +43,37 @@
2.19 (string-upcase
2.20 (substitute #\- #\_ string)))
2.21
2.22+(labels ((parse-type-id (type-id) ;; thrift typeId string -> lisp type specifier, NIL when not a base/container type
2.23+ (string-case (type-id :default nil)
2.24+ ("bool" 'boolean)
2.25+ ("byte" '(signed-byte 8)) ;; thrift byte is an 8-bit signed integer; bare SIGNED-BYTE would admit any integer
2.26+ ("i16" '(signed-byte 16))
2.27+ ("i32" '(signed-byte 32))
2.28+ ("i64" '(signed-byte 64))
2.29+ ("double" 'double-float)
2.30+ ("string" 'string)
2.31+ ("list" 'list)
2.32+ ("binary" 'octet-vector)
2.33+ ("set" 'list)))
2.34+ (parse-type (o) ;; JSON type object -> symbol PARQUET-<CLASS> interned in :dat/parquet
2.35+ (intern
2.36+ (concatenate 'string
2.37+ "PARQUET-"
2.38+ (camelcase-name-to-lisp-name
2.39+ (string-case ((json-getf o "typeId")) ;; only class-bearing type ids are handled here
2.40+ ("union" (json-getf o "class"))
2.41+ ("struct" (json-getf o "class"))
2.42+ ("enum" (json-getf o "class")))))
2.43+ :dat/parquet)))
2.44+ (defun convert-parquet-struct-field-type (field) ;; technically part of thrift type system; returns a slot type specifier for FIELD, or NIL when none could be derived
2.45+ (let* ((type-id (parquet-struct-field-type-id field))
2.46+ (type (parquet-struct-field-type field))
2.47+ (required (parquet-struct-field-required field))
2.48+ (unit-type (or (when type-id (parse-type-id type-id)) (when type (parse-type type))))) ;; base type wins, else fall back to the class named by the type object
2.49+ (if (and unit-type (equal "optional" required) (not (equal unit-type 'list))) ;; (listp nil) = t; a NIL unit-type stays NIL instead of becoming (or null nil)
2.50+ `(or null ,unit-type)
2.51+ unit-type))))
2.52+
2.53 (defun parquet-json-enums ()
2.54 (list
2.55 (def-parquet-enum types "Type")
2.56@@ -77,14 +108,7 @@
2.57 (parquet-struct-exceptionp struct)
2.58 (mapcar #'parquet-destruct-field (parquet-struct-fields struct))))
2.59
2.60-(flet ((pq-type-parse (o) (let ((id (json-getf o "typeId")))
2.61- (string-case (id :default (warn 'simple-warning :format-control "unknown typeId: ~A"
2.62- :format-arguments (list id)))
2.63- ("list" (cons id (json-getf o "elemTypeId")))
2.64- ("union" (cons id (json-getf o "class")))
2.65- ("struct" (cons id (json-getf o "class")))
2.66- ("enum" (cons id (json-getf o "class")))))))
2.67- (defun parquet-json-structs () ;; name doc isException isUnion fields
2.68+(defun parquet-json-structs () ;; name doc isException isUnion fields
2.69 (mapcar
2.70 (lambda (s)
2.71 (let ((name (json-getf s "name"))
2.72@@ -97,13 +121,12 @@
2.73 (name (json-getf f "name"))
2.74 (type-id (json-getf f "typeId"))
2.75 ;; json object - needs additional parsing
2.76- (type (when-let ((ty (json-getf f "type")))
2.77- (pq-type-parse ty)))
2.78+ (type (json-getf f "type"))
2.79 (doc (json-getf f "doc"))
2.80 (required (json-getf f "required")))
2.81 (make-parquet-struct-field key name type-id type doc required)))))
2.82 (make-parquet-struct name doc exceptionp unionp fields)))
2.83- (json-getf *parquet-json* "structs"))))
2.84+ (json-getf *parquet-json* "structs")))
2.85
2.86 (defun parquet-json-namespaces ()
2.87 (json-getf *parquet-json* "namespaces"))
2.88@@ -124,40 +147,42 @@
2.89 "Define a new subclass of PARQUET-OBJECT with NAME."
2.90 `(defclass ,name ,@(if-let ((s superclasses)) (list s) `((parquet-object))) ,slots ,@options))
2.91
2.92-(define-parquet-class dat/parquet:parquet-enum-object () ())
2.93-(define-parquet-class dat/parquet:parquet-struct-object () ())
2.94-
2.95 ;;; Codegen
2.96
2.97 ;; 8)
2.98-(defun %define-parquet-structs ()
2.99- "Define all known values in *PARQUET-STRUCTS* using DEFINE-PARQUET-CLASS (DEFCLASS)."
2.100- (loop for struct in *parquet-structs*
2.101- unless (null struct)
2.102- collect (let ((name (parquet-struct-name struct))
2.103- (doc (parquet-struct-doc struct))
2.104- (fields (parquet-struct-fields struct)))
2.105- `(define-parquet-class ,(intern (concatenate 'string
2.106- "PARQUET-"
2.107- (camelcase-name-to-lisp-name name))
2.108- :dat/parquet)
2.109- (parquet-struct-object)
2.110- (,@(mapcar (lambda (f)
2.111- (let ((fdoc (parquet-struct-field-doc f))
2.112- (fname (snakecase-name-to-lisp-name
2.113- (parquet-struct-field-name f))))
2.114- `(,(symbolicate fname)
2.115- ,@(when fdoc `(:documentation ,fdoc))
2.116- :initarg ,(keywordicate fname)
2.117- ;; TODO 2024-07-12:
2.118- ,@(when (equal "optional" (parquet-struct-field-required f))
2.119- `(:initform nil)))))
2.120- fields))
2.121- ,@(when doc `((:documentation ,doc)))))))
2.122+(eval-always ;; presumably makes the function available while DEFINE-PARQUET-STRUCTS is being macroexpanded -- confirm EVAL-ALWAYS semantics in :std
2.123+ (defun %define-parquet-structs ()
2.124+ "Define all known values in *PARQUET-STRUCTS* using DEFINE-PARQUET-CLASS (DEFCLASS)." ;; NOTE: this function only collects and returns the forms; it does not evaluate them
2.125+ (loop for struct in *parquet-structs*
2.126+ unless (null struct) ;; skip empty entries
2.127+ collect (let ((name (parquet-struct-name struct))
2.128+ (doc (parquet-struct-doc struct))
2.129+ (fields (parquet-struct-fields struct)))
2.130+ `(define-parquet-class ,(intern (cond ;; class name symbol, interned in :dat/parquet
2.131+ ((equal name "UUIDType") "PARQUET-UUID-TYPE") ;; hard-coded name; presumably the generic camelcase conversion mishandles the acronym -- TODO confirm
2.132+ (t (concatenate 'string
2.133+ "PARQUET-"
2.134+ (camelcase-name-to-lisp-name name))))
2.135+ :dat/parquet)
2.136+ (parquet-struct-object) ;; superclass list
2.137+ (,@(mapcar (lambda (f) ;; one slot specifier per struct field
2.138+ (let ((fdoc (parquet-struct-field-doc f))
2.139+ (fname (snakecase-name-to-lisp-name
2.140+ (parquet-struct-field-name f))))
2.141+ `(,(intern fname :dat/parquet) ;; slot name symbol, interned in :dat/parquet
2.142+ ,@(when fdoc `(:documentation ,fdoc))
2.143+ :initarg ,(keywordicate fname)
2.144+ ;; TODO 2024-07-12:
2.145+ ,@(when (equal "optional" (parquet-struct-field-required f)) ;; optional fields default to NIL
2.146+ `(:initform nil))
2.147+ ,@(when-let ((ty (convert-parquet-struct-field-type f))) ;; :type is emitted only when a type specifier was derived
2.148+ `(:type ,ty)))))
2.149+ fields))
2.150+ ,@(when doc `((:documentation ,doc)))))))) ;; class-level docstring option, when the struct has one
2.151
2.152 (defmacro define-parquet-structs ()
2.153 `(list
2.154- ,@(%define-parquet-structs)))
2.155+ ,@(%define-parquet-structs)))
2.156
2.157 (defmacro define-parquet-type (name opts &body body)
2.158 "Define a parquet type with DEFTYPE which maps to LISP-TYPE."
2.159@@ -178,6 +203,8 @@
2.160 (defun load-parquet (&key (file *parquet-json-file*))
2.161 (init-parquet-json file)
2.162 (with-package (:dat/parquet)
2.163+ (define-parquet-class parquet-enum-object () ())
2.164+ (define-parquet-class parquet-struct-object () ())
2.165 (export (define-parquet-types))
2.166 (export (mapcar 'class-name (define-parquet-structs)))
2.167 (export *parquet-enums*)))
3.1--- a/lisp/lib/dat/parquet/parquet.lisp Fri Jul 12 19:57:18 2024 -0400
3.2+++ b/lisp/lib/dat/parquet/parquet.lisp Fri Jul 12 22:33:57 2024 -0400
3.3@@ -10,6 +10,28 @@
3.4 https://github.com/apache/parquet-testing
3.5 https://github.com/apache/parquet-java
3.6 https://github.com/apache/arrow-rs
3.7+
3.8+https://thrift.apache.org/docs/types
3.9+|#
3.10+
3.11+#|
3.12+ 4-byte magic number "PAR1"
3.13+ <Column 1 Chunk 1>
3.14+ <Column 2 Chunk 1>
3.15+ ...
3.16+ <Column N Chunk 1>
3.17+ <Column 1 Chunk 2>
3.18+ <Column 2 Chunk 2>
3.19+ ...
3.20+ <Column N Chunk 2>
3.21+ ...
3.22+ <Column 1 Chunk M>
3.23+ <Column 2 Chunk M>
3.24+ ...
3.25+ <Column N Chunk M>
3.26+ File Metadata
3.27+ 4-byte length in bytes of file metadata (little endian)
3.28+ 4-byte magic number "PAR1"
3.29 |#
3.30
3.31 ;; In this package we're being as lazy as possible. To generate our own
3.32@@ -21,12 +43,13 @@
3.33 ;;
3.34 ;;; Code:
3.35 (in-package :dat/parquet)
3.36-(eval-when (:compile-toplevel)
3.37- (load-parquet))
3.38+(eval-always
3.39+ (dat/parquet/gen::load-parquet))
3.40
3.41 (defgeneric parquet-read (value &optional stream))
3.42 (defgeneric parquet-write (value &optional stream))
3.43
3.44+;; HACK 2024-07-12:
3.45 (define-bitfield parquet-compression-codec
3.46 (uncompressed boolean)
3.47 (snappy boolean)
3.48@@ -38,6 +61,21 @@
3.49 (lz4-raw boolean))
3.50
3.51 ;;; Read/Write
3.52+(define-constant +parquet-magic-number+ "PAR1" :test 'equal)
3.53+
3.54+(defconstant +default-parquet-page-size+ (* 8 1024)) ;; 8 KiB, in bytes
3.55+(defconstant +default-parquet-row-group-size+ (expt 1024 3)) ;; 1 GiB, in bytes; name now carries the closing + like the other constants here
3.56+
3.57+(defun parquet-write-magic (stream) ;; write the characters of +PARQUET-MAGIC-NUMBER+ to STREAM
3.58+ (write-string +parquet-magic-number+ stream)) ;; WRITE-STRING returns the string it wrote
3.59+
3.60+(defun parquet-read-magic (stream)
3.61+ "Read one character from STREAM per character of +PARQUET-MAGIC-NUMBER+, asserting each matches; return T."
3.62+ ;; walk the constant at run time: #. would need its value to exist at read time, which COMPILE-FILE does not guarantee
3.63+ (loop for expected across +parquet-magic-number+
3.64+ do (assert (char= expected (read-char stream))))
3.65+ t)
3.66+
3.67 (defmethod parquet-write ((value (eql t)) &optional stream)
3.68 "Encode a parquet boolean true value."
3.69 (declare (ignore value))
4.1--- /dev/null Thu Jan 01 00:00:00 1970 +0000
4.2+++ b/lisp/lib/dat/parquet/pkg.lisp Fri Jul 12 22:33:57 2024 -0400
4.3@@ -0,0 +1,10 @@
4.4+;;; pkg.lisp --- Parquet Packages
4.5+
4.6+;; Defines the DAT/PARQUET/GEN package, which exports LOAD-PARQUET.
4.7+
4.8+;;; Code:
4.9+(in-package :dat/parquet)
4.10+
4.11+(defpackage :dat/parquet/gen
4.12+ (:use :cl :std :dat/proto :dat/json)
4.13+ (:export :load-parquet))
5.1--- a/lisp/lib/dat/pkg.lisp Fri Jul 12 19:57:18 2024 -0400
5.2+++ b/lisp/lib/dat/pkg.lisp Fri Jul 12 22:33:57 2024 -0400
5.3@@ -233,12 +233,8 @@
5.4 (:use :cl :std :dat/proto)
5.5 (:export))
5.6
5.7-(defpackage :dat/parquet/gen
5.8- (:use :cl :std :dat/proto :dat/json))
5.9-
5.10 (defpackage :dat/parquet
5.11 (:use :cl :std :dat/proto :dat/json)
5.12- (:import-from :dat/parquet/gen :load-parquet)
5.13 (:export
5.14 :parquet-object
5.15 :parquet-enum-object
6.1--- a/lisp/lib/dat/tests.lisp Fri Jul 12 19:57:18 2024 -0400
6.2+++ b/lisp/lib/dat/tests.lisp Fri Jul 12 22:33:57 2024 -0400
6.3@@ -145,3 +145,11 @@
6.4 (read-sxp-stream f s))
6.5 (with-output-to-string (s)
6.6 (is (write-sxp-stream f s)))))
6.7+
6.8+(deftest parquet-basic () ;; magic-number round trip: write it to a string, then read it back
6.9+ (is
6.10+ (with-input-from-string
6.11+ (s ;; input stream over the string produced below
6.12+ (with-output-to-string (s) ;; this S is a separate output stream, scoped to this form only
6.13+ (dat/parquet::parquet-write-magic s)))
6.14+ (dat/parquet::parquet-read-magic s)))) ;; returns T when every character matched