changeset 540: |
bd49b7e2c623 |
parent 539: |
cf0c1933289f |
child 541: |
10c4bb778030 |
author: |
Richard Westhaver <ellis@rwest.io> |
date: |
Fri, 12 Jul 2024 19:57:18 -0400 |
files: |
lisp/lib/dat/dat.asd lisp/lib/dat/parquet.lisp lisp/lib/dat/parquet/gen.lisp lisp/lib/dat/parquet/parquet.lisp lisp/lib/dat/pkg.lisp |
description: |
more parquettiquette |
1.1--- a/lisp/lib/dat/dat.asd Thu Jul 11 21:40:29 2024 -0400
1.2+++ b/lisp/lib/dat/dat.asd Fri Jul 12 19:57:18 2024 -0400
1.3@@ -9,6 +9,10 @@
1.4 (:file "dot")
1.5 (:file "csv")
1.6 (:file "json")
1.7+ (:module "parquet"
1.8+ :components
1.9+ ((:file "gen")
1.10+ (:file "parquet")))
1.11 (:module "xml"
1.12 :components
1.13 ((:file "xml")
2.1--- a/lisp/lib/dat/parquet.lisp Thu Jul 11 21:40:29 2024 -0400
2.2+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
2.3@@ -1,119 +0,0 @@
2.4-;;; parquet.lisp --- Apache Parquet
2.5-
2.6-;; Common Lisp implementation of Apache Parquet
2.7-
2.8-;;; Commentary:
2.9-
2.10-#|
2.11-https://github.com/apache/parquet-format
2.12-https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
2.13-https://github.com/apache/parquet-testing
2.14-https://github.com/apache/parquet-java
2.15-https://github.com/apache/arrow-rs
2.16-|#
2.17-
2.18-;; In this package we're being as lazy as possible. To generate our own
2.19-;; encoder/decoder methods we depend on the file parquet.thrift in the
2.20-;; parquet-format repo above. The core skelfile includes a script to download
2.21-;; it and convert it to parquet.json (requires the thirft cli tool). We then
2.22-;; decode it with DAT/JSON and visit all elements recursively, generating lisp
2.23-;; code using pre-compiled macros.
2.24-
2.25-;;
2.26-;;; Code:
2.27-(in-package :dat/parquet)
2.28-
2.29-(eval-always
2.30- (defparameter *default-parquet-json-file*
2.31- (probe-file #.(asdf:system-relative-pathname :prelude #P"../.stash/parquet.json")))
2.32- (defvar *parquet-json* nil)
2.33- (defun load-parquet-json (&optional (json-file *default-parquet-json-file*))
2.34- (with-open-file (file json-file)
2.35- (setq *parquet-json* (json-read file))))
2.36- (defun parquet-json-enums ()
2.37- (json-getf *parquet-json* "enums"))
2.38-
2.39- (defun parquet-json-enum-getf (name)
2.40- (json-getf
2.41- (find-if (lambda (x) (equal name (json-getf x "name"))) (parquet-json-enums))
2.42- "members"))
2.43-
2.44- (defmacro def-parquet-enum (sym name)
2.45- `(progn
2.46- (defvar ,(symbolicate '*parquet- sym '*) nil)
2.47- (defun ,(symbolicate 'parquet-json- sym) ()
2.48- (mapcar (lambda (x) (json-getf x "name")) (parquet-json-enum-getf ,name)))))
2.49-
2.50- (defvar *parquet-structs* nil)
2.51- (defstruct (parquet-struct
2.52- (:constructor make-parquet-struct (name doc exceptionp unionp fields)))
2.53- name doc exceptionp unionp (fields nil :type list))
2.54- (defstruct (parquet-struct-field
2.55- (:constructor make-parquet-struct-field (key name type-id type doc required)))
2.56- key name type-id type doc required)
2.57- (defun parquet-json-structs () ;; name doc isException isUnion fields
2.58- (mapcar
2.59- (lambda (s)
2.60- (let ((name (json-getf s "name"))
2.61- (doc (json-getf s "doc"))
2.62- (exceptionp (json-getf s "isException"))
2.63- (unionp (json-getf s "isUnion"))
2.64- (fields (loop for f in (json-getf s "fields")
2.65- collect
2.66- (let ((key (json-getf f "key"))
2.67- (name (json-getf f "name"))
2.68- (type-id (json-getf f "typeId"))
2.69- ;; json object - needs additional parsing
2.70- (type (print (json-getf f "type")))
2.71- (doc (json-getf f "doc"))
2.72- (required (json-getf f "required")))
2.73- (make-parquet-struct-field key name type-id type doc required)))))
2.74- (make-parquet-struct name doc exceptionp unionp fields) *parquet-structs*))
2.75- (json-getf *parquet-json* "structs")))
2.76-
2.77- (defun parquet-json-namespaces ()
2.78- (json-getf *parquet-json* "namespaces"))
2.79-
2.80- (defun init-parquet-json ()
2.81- (load-parquet-json)
2.82- (def-parquet-enum types "Type")
2.83- (def-parquet-enum converted-types "ConvertedType")
2.84- (def-parquet-enum field-repetition-types "FieldRepetitionType")
2.85- (def-parquet-enum encodings "Encoding")
2.86- (def-parquet-enum compression-codecs "CompressionCodec")
2.87- (def-parquet-enum page-types "PageType")
2.88- (def-parquet-enum boundary-orders "BoundaryOrder")
2.89- (setq *parquet-structs* (parquet-json-structs))))
2.90-
2.91-(eval-when (:compile-toplevel)
2.92- (init-parquet-json))
2.93-
2.94-(defclass parquet-object () ())
2.95-
2.96-(defmethod print-object ((obj parquet-object) stream)
2.97- "Output a Parquet object to a stream."
2.98- (print-unreadable-object (obj stream :type t)
2.99- (parquet-encode obj stream)))
2.100-
2.101-(defmacro define-parquet-class (name superclasses slots &rest options)
2.102- "Define a new subclass of PARQUET-OBJECT with NAME."
2.103- `(defclass ,name ,(push 'parquet-object superclasses) ,slots ,@options))
2.104-
2.105-(define-parquet-class logical-parquet-object () ())
2.106-
2.107-(defgeneric parquet-read (value &optional stream))
2.108-(defgeneric parquet-write (value &optional stream))
2.109-
2.110-(defmethod parquet-write ((value (eql t)) &optional stream)
2.111- "Encode a parquet boolean true value."
2.112- (declare (ignore value))
2.113- (write-byte 1 stream))
2.114-
2.115-(defmethod parquet-write ((value (eql nil)) &optional stream)
2.116- "Encode a parquet boolean false value."
2.117- (declare (ignore value))
2.118- (write-byte 0 stream))
2.119-
2.120-(defun parquet-encode (value &optional stream)
2.121- "Encode a Lisp value and write it to a parquet stream."
2.122- (parquet-write value stream))
3.1--- /dev/null Thu Jan 01 00:00:00 1970 +0000
3.2+++ b/lisp/lib/dat/parquet/gen.lisp Fri Jul 12 19:57:18 2024 -0400
3.3@@ -0,0 +1,183 @@
3.4+;;; gen.lisp --- Parquet Lisp Code Generator
3.5+
3.6+;;
3.7+
3.8+;;; Code:
3.9+(in-package :dat/parquet/gen)
3.10+(defparameter *parquet-json-file*
3.11+ (probe-file #.(asdf:system-relative-pathname :prelude #P"../.stash/parquet.json")))
3.12+(defvar *parquet-json* nil)
3.13+(defun load-parquet-json (&optional (json-file *parquet-json-file*))
3.14+ (with-open-file (file json-file)
3.15+ (setq *parquet-json* (json-read file))))
3.16+
3.17+(defun %parquet-json-enums ()
3.18+ (json-getf *parquet-json* "enums"))
3.19+
3.20+(defun parquet-json-enum-getf (name)
3.21+ (json-getf
3.22+ (find-if (lambda (x) (equal name (json-getf x "name"))) (%parquet-json-enums))
3.23+ "members"))
3.24+
3.25+(defvar *parquet-enums* nil)
3.26+
3.27+(defmacro def-parquet-enum (sym name)
3.28+ `(progn
3.29+ (defun ,(symbolicate 'parquet-json- sym) ()
3.30+ (mapcar (lambda (x) (json-getf x "name")) (parquet-json-enum-getf ,name)))
3.31+ (defparameter ,(intern
3.32+ (concatenate 'string "*PARQUET-" (symbol-name sym) "*")
3.33+ :dat/parquet)
3.34+ (,(symbolicate 'parquet-json- sym)))))
3.35+
3.36+(defun camelcase-name-to-lisp-name (string)
3.37+ (string-upcase
3.38+ (with-output-to-string (name)
3.39+ (loop for i from 0 below (length string)
3.40+ for c across string
3.41+ when (and (upper-case-p c) (not (zerop i)))
3.42+ do (write-char #\- name)
3.43+ do (write-char c name)))))
3.44+
3.45+(defun snakecase-name-to-lisp-name (string)
3.46+ (string-upcase
3.47+ (substitute #\- #\_ string)))
3.48+
3.49+(defun parquet-json-enums ()
3.50+ (list
3.51+ (def-parquet-enum types "Type")
3.52+ (def-parquet-enum converted-types "ConvertedType")
3.53+ (def-parquet-enum field-repetition-types "FieldRepetitionType")
3.54+ (def-parquet-enum encodings "Encoding")
3.55+ (def-parquet-enum compression-codecs "CompressionCodec")
3.56+ (def-parquet-enum page-types "PageType")
3.57+ (def-parquet-enum boundary-orders "BoundaryOrder")))
3.58+
3.59+(defvar *parquet-structs* nil)
3.60+(defstruct (parquet-struct
3.61+ (:constructor make-parquet-struct (name doc exceptionp unionp fields)))
3.62+ name doc exceptionp unionp (fields nil :type list))
3.63+
3.64+(defstruct (parquet-struct-field
3.65+ (:constructor make-parquet-struct-field (key name type-id type doc required)))
3.66+ key name type-id type doc required)
3.67+
3.68+(defun parquet-destruct-field (field)
3.69+ (list (parquet-struct-field-name field)
3.70+ (parquet-struct-field-key field)
3.71+ (parquet-struct-field-doc field)
3.72+ (parquet-struct-field-type-id field)
3.73+ (parquet-struct-field-type field)
3.74+ (parquet-struct-field-required field)))
3.75+
3.76+(defun parquet-destruct (struct)
3.77+ (list (parquet-struct-name struct)
3.78+ (parquet-struct-doc struct)
3.79+ (parquet-struct-unionp struct)
3.80+ (parquet-struct-exceptionp struct)
3.81+ (mapcar #'parquet-destruct-field (parquet-struct-fields struct))))
3.82+
3.83+(flet ((pq-type-parse (o) (let ((id (json-getf o "typeId")))
3.84+ (string-case (id :default (warn 'simple-warning :format-control "unknown typeId: ~A"
3.85+ :format-arguments (list id)))
3.86+ ("list" (cons id (json-getf o "elemTypeId")))
3.87+ ("union" (cons id (json-getf o "class")))
3.88+ ("struct" (cons id (json-getf o "class")))
3.89+ ("enum" (cons id (json-getf o "class")))))))
3.90+ (defun parquet-json-structs () ;; name doc isException isUnion fields
3.91+ (mapcar
3.92+ (lambda (s)
3.93+ (let ((name (json-getf s "name"))
3.94+ (doc (json-getf s "doc"))
3.95+ (exceptionp (json-getf s "isException"))
3.96+ (unionp (json-getf s "isUnion"))
3.97+ (fields (loop for f in (json-getf s "fields")
3.98+ collect
3.99+ (let ((key (json-getf f "key"))
3.100+ (name (json-getf f "name"))
3.101+ (type-id (json-getf f "typeId"))
3.102+ ;; json object - needs additional parsing
3.103+ (type (when-let ((ty (json-getf f "type")))
3.104+ (pq-type-parse ty)))
3.105+ (doc (json-getf f "doc"))
3.106+ (required (json-getf f "required")))
3.107+ (make-parquet-struct-field key name type-id type doc required)))))
3.108+ (make-parquet-struct name doc exceptionp unionp fields)))
3.109+ (json-getf *parquet-json* "structs"))))
3.110+
3.111+(defun parquet-json-namespaces ()
3.112+ (json-getf *parquet-json* "namespaces"))
3.113+
3.114+(defun init-parquet-json (&optional (file *parquet-json-file*))
3.115+ (load-parquet-json file)
3.116+ (setq *parquet-enums* (parquet-json-enums))
3.117+ (setq *parquet-structs* (parquet-json-structs)))
3.118+
3.119+;;; CLOS
3.120+(defclass parquet-object () ())
3.121+
3.122+;; (defmethod print-object ((obj parquet-object) stream)
3.123+;; "Output a Parquet object to a stream."
3.124+;; (print-unreadable-object (obj stream :type t)))
3.125+
3.126+(defmacro define-parquet-class (name superclasses slots &rest options)
3.127+ "Define a new subclass of PARQUET-OBJECT with NAME."
3.128+ `(defclass ,name ,@(if-let ((s superclasses)) (list s) `((parquet-object))) ,slots ,@options))
3.129+
3.130+(define-parquet-class dat/parquet:parquet-enum-object () ())
3.131+(define-parquet-class dat/parquet:parquet-struct-object () ())
3.132+
3.133+;;; Codegen
3.134+
3.135+;; 8)
3.136+(defun %define-parquet-structs ()
3.137+ "Define all known values in *PARQUET-STRUCTS* using DEFINE-PARQUET-CLASS (DEFCLASS)."
3.138+ (loop for struct in *parquet-structs*
3.139+ unless (null struct)
3.140+ collect (let ((name (parquet-struct-name struct))
3.141+ (doc (parquet-struct-doc struct))
3.142+ (fields (parquet-struct-fields struct)))
3.143+ `(define-parquet-class ,(intern (concatenate 'string
3.144+ "PARQUET-"
3.145+ (camelcase-name-to-lisp-name name))
3.146+ :dat/parquet)
3.147+ (parquet-struct-object)
3.148+ (,@(mapcar (lambda (f)
3.149+ (let ((fdoc (parquet-struct-field-doc f))
3.150+ (fname (snakecase-name-to-lisp-name
3.151+ (parquet-struct-field-name f))))
3.152+ `(,(symbolicate fname)
3.153+ ,@(when fdoc `(:documentation ,fdoc))
3.154+ :initarg ,(keywordicate fname)
3.155+ ;; TODO 2024-07-12:
3.156+ ,@(when (equal "optional" (parquet-struct-field-required f))
3.157+ `(:initform nil)))))
3.158+ fields))
3.159+ ,@(when doc `((:documentation ,doc)))))))
3.160+
3.161+(defmacro define-parquet-structs ()
3.162+ `(list
3.163+ ,@(%define-parquet-structs)))
3.164+
3.165+(defmacro define-parquet-type (name opts &body body)
3.166+ "Define a parquet type with DEFTYPE which maps to LISP-TYPE."
3.167+ `(deftype ,(intern (concatenate 'string "PARQUET-" (substitute #\- #\_ name)) :dat/parquet) ,opts ,@body))
3.168+
3.169+(defun define-parquet-types ()
3.170+ "Define all known values in *PARQUET-TYPES* using DEFINE-PARQUET-TYPE (DEFTYPE)."
3.171+ (list
3.172+ (define-parquet-type "BOOLEAN" () 'boolean)
3.173+ (define-parquet-type "INT32" () '(signed-byte 32))
3.174+ (define-parquet-type "INT64" () '(signed-byte 64))
3.175+ (define-parquet-type "INT96" () '(signed-byte 96))
3.176+ (define-parquet-type "FLOAT" () 'float)
3.177+ (define-parquet-type "DOUBLE" () 'double-float)
3.178+ (define-parquet-type "BYTE_ARRAY" (&optional size) `(octet-vector ,size))
3.179+ (define-parquet-type "FIXED_LEN_BYTE_ARRAY" (size) `(octet-vector ,size))))
3.180+
3.181+(defun load-parquet (&key (file *parquet-json-file*))
3.182+ (init-parquet-json file)
3.183+ (with-package (:dat/parquet)
3.184+ (export (define-parquet-types))
3.185+ (export (mapcar 'class-name (define-parquet-structs)))
3.186+ (export *parquet-enums*)))
4.1--- /dev/null Thu Jan 01 00:00:00 1970 +0000
4.2+++ b/lisp/lib/dat/parquet/parquet.lisp Fri Jul 12 19:57:18 2024 -0400
4.3@@ -0,0 +1,62 @@
4.4+;;; parquet.lisp --- Apache Parquet
4.5+
4.6+;; Common Lisp implementation of Apache Parquet
4.7+
4.8+;;; Commentary:
4.9+
4.10+#|
4.11+https://github.com/apache/parquet-format
4.12+https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
4.13+https://github.com/apache/parquet-testing
4.14+https://github.com/apache/parquet-java
4.15+https://github.com/apache/arrow-rs
4.16+|#
4.17+
4.18+;; In this package we're being as lazy as possible. To generate our own
4.19+;; encoder/decoder methods we depend on the file parquet.thrift in the
4.20+;; parquet-format repo above. The core skelfile includes a script to download
4.21+;; it and convert it to parquet.json (requires the thrift CLI tool). We then
4.22+;; decode it with DAT/JSON and generate Lisp classes and types.
4.23+
4.24+;;
4.25+;;; Code:
4.26+(in-package :dat/parquet)
4.27+(eval-when (:compile-toplevel)
4.28+ (load-parquet))
4.29+
4.30+(defgeneric parquet-read (value &optional stream))
4.31+(defgeneric parquet-write (value &optional stream))
4.32+
4.33+(define-bitfield parquet-compression-codec
4.34+ (uncompressed boolean)
4.35+ (snappy boolean)
4.36+ (gzip boolean)
4.37+ (lzo boolean)
4.38+ (brotli boolean)
4.39+ (lz4 boolean)
4.40+ (zstd boolean)
4.41+ (lz4-raw boolean))
4.42+
4.43+;;; Read/Write
4.44+(defmethod parquet-write ((value (eql t)) &optional stream)
4.45+ "Encode a parquet boolean true value."
4.46+ (declare (ignore value))
4.47+ (write-byte 1 stream))
4.48+
4.49+(defmethod parquet-write ((value (eql nil)) &optional stream)
4.50+ "Encode a parquet boolean false value."
4.51+ (declare (ignore value))
4.52+ (write-byte 0 stream))
4.53+
4.54+(defmethod parquet-write ((value string) &optional stream))
4.55+
4.56+;;; Encode/Decode
4.57+(defun parquet-encode (value &optional stream)
4.58+ "Encode a Lisp value and write it to a parquet stream."
4.59+ (parquet-write value stream))
4.60+
4.61+(defun parquet-decode (string &key (start 0) end)
4.62+ "Convert a PARQUET string into a Lisp object."
4.63+ (with-input-from-string (stream string :start start :end end)
4.64+ (values (parquet-read stream)
4.65+ (file-position stream))))
5.1--- a/lisp/lib/dat/pkg.lisp Thu Jul 11 21:40:29 2024 -0400
5.2+++ b/lisp/lib/dat/pkg.lisp Fri Jul 12 19:57:18 2024 -0400
5.3@@ -233,9 +233,20 @@
5.4 (:use :cl :std :dat/proto)
5.5 (:export))
5.6
5.7+(defpackage :dat/parquet/gen
5.8+ (:use :cl :std :dat/proto :dat/json))
5.9+
5.10 (defpackage :dat/parquet
5.11 (:use :cl :std :dat/proto :dat/json)
5.12- (:export))
5.13+ (:import-from :dat/parquet/gen :load-parquet)
5.14+ (:export
5.15+ :parquet-object
5.16+ :parquet-enum-object
5.17+ :parquet-struct-object
5.18+ :parquet-read
5.19+ :parquet-write
5.20+ :parquet-encode
5.21+ :parquet-decode))
5.22
5.23 (pkg:defpkg :dat
5.24 (:use-reexport :dat/proto :dat/csv :dat/arff