changelog shortlog graph tags branches files raw help

Mercurial > core / changeset: more parquettiquette

changeset 540: bd49b7e2c623
parent 539: cf0c1933289f
child 541: 10c4bb778030
author: Richard Westhaver <ellis@rwest.io>
date: Fri, 12 Jul 2024 19:57:18 -0400
files: lisp/lib/dat/dat.asd lisp/lib/dat/parquet.lisp lisp/lib/dat/parquet/gen.lisp lisp/lib/dat/parquet/parquet.lisp lisp/lib/dat/pkg.lisp
description: more parquettiquette
     1.1--- a/lisp/lib/dat/dat.asd	Thu Jul 11 21:40:29 2024 -0400
     1.2+++ b/lisp/lib/dat/dat.asd	Fri Jul 12 19:57:18 2024 -0400
     1.3@@ -9,6 +9,10 @@
     1.4                (:file "dot")
     1.5                (:file "csv")
     1.6                (:file "json")
     1.7+               (:module "parquet"
     1.8+                :components
     1.9+                ((:file "gen")
    1.10+                 (:file "parquet")))
    1.11                (:module "xml"
    1.12                 :components
    1.13                 ((:file "xml")
     2.1--- a/lisp/lib/dat/parquet.lisp	Thu Jul 11 21:40:29 2024 -0400
     2.2+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     2.3@@ -1,119 +0,0 @@
     2.4-;;; parquet.lisp --- Apache Parquet
     2.5-
     2.6-;; Common Lisp implementation of Apache Parquet
     2.7-
     2.8-;;; Commentary:
     2.9-
    2.10-#|
    2.11-https://github.com/apache/parquet-format
    2.12-https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
    2.13-https://github.com/apache/parquet-testing
    2.14-https://github.com/apache/parquet-java
    2.15-https://github.com/apache/arrow-rs
    2.16-|#
    2.17-
    2.18-;; In this package we're being as lazy as possible. To generate our own
    2.19-;; encoder/decoder methods we depend on the file parquet.thrift in the
    2.20-;; parquet-format repo above. The core skelfile includes a script to download
    2.21-;; it and convert it to parquet.json (requires the thirft cli tool). We then
    2.22-;; decode it with DAT/JSON and visit all elements recursively, generating lisp
    2.23-;; code using pre-compiled macros.
    2.24-
    2.25-;; 
    2.26-;;; Code:
    2.27-(in-package :dat/parquet)
    2.28-
    2.29-(eval-always
    2.30-  (defparameter *default-parquet-json-file*
    2.31-    (probe-file #.(asdf:system-relative-pathname :prelude #P"../.stash/parquet.json")))
    2.32-  (defvar *parquet-json* nil)
    2.33-  (defun load-parquet-json (&optional (json-file *default-parquet-json-file*))
    2.34-    (with-open-file (file json-file)
    2.35-      (setq *parquet-json* (json-read file))))
    2.36-  (defun parquet-json-enums ()
    2.37-    (json-getf *parquet-json* "enums"))  
    2.38-
    2.39-  (defun parquet-json-enum-getf (name)
    2.40-    (json-getf
    2.41-     (find-if (lambda (x) (equal name (json-getf x "name"))) (parquet-json-enums))
    2.42-     "members"))
    2.43-
    2.44-  (defmacro def-parquet-enum (sym name)
    2.45-    `(progn
    2.46-       (defvar ,(symbolicate '*parquet- sym '*) nil)
    2.47-       (defun ,(symbolicate 'parquet-json- sym) ()
    2.48-         (mapcar (lambda (x) (json-getf x "name")) (parquet-json-enum-getf ,name)))))
    2.49-
    2.50-  (defvar *parquet-structs* nil)
    2.51-  (defstruct (parquet-struct
    2.52-              (:constructor make-parquet-struct (name doc exceptionp unionp fields)))
    2.53-    name doc exceptionp unionp (fields nil :type list))
    2.54-  (defstruct (parquet-struct-field
    2.55-              (:constructor make-parquet-struct-field (key name type-id type doc required)))
    2.56-    key name type-id type doc required)
    2.57-  (defun parquet-json-structs () ;; name doc isException isUnion fields
    2.58-    (mapcar
    2.59-     (lambda (s)
    2.60-       (let ((name (json-getf s "name"))
    2.61-             (doc (json-getf s "doc"))
    2.62-             (exceptionp (json-getf s "isException"))
    2.63-             (unionp (json-getf s "isUnion"))
    2.64-             (fields (loop for f in (json-getf s "fields")
    2.65-                           collect
    2.66-                              (let ((key (json-getf f "key"))
    2.67-                                    (name (json-getf f "name"))
    2.68-                                    (type-id (json-getf f "typeId"))
    2.69-                                    ;; json object - needs additional parsing
    2.70-                                    (type (print (json-getf f "type")))
    2.71-                                    (doc (json-getf f "doc"))
    2.72-                                    (required (json-getf f "required")))
    2.73-                                (make-parquet-struct-field key name type-id type doc required)))))
    2.74-         (make-parquet-struct name doc exceptionp unionp fields) *parquet-structs*))
    2.75-     (json-getf *parquet-json* "structs")))
    2.76-
    2.77-  (defun parquet-json-namespaces ()
    2.78-    (json-getf *parquet-json* "namespaces"))
    2.79-
    2.80-  (defun init-parquet-json ()
    2.81-    (load-parquet-json)
    2.82-    (def-parquet-enum types "Type")
    2.83-    (def-parquet-enum converted-types "ConvertedType")
    2.84-    (def-parquet-enum field-repetition-types "FieldRepetitionType")
    2.85-    (def-parquet-enum encodings "Encoding")
    2.86-    (def-parquet-enum compression-codecs "CompressionCodec")
    2.87-    (def-parquet-enum page-types "PageType")
    2.88-    (def-parquet-enum boundary-orders "BoundaryOrder")
    2.89-    (setq *parquet-structs* (parquet-json-structs))))
    2.90-
    2.91-(eval-when (:compile-toplevel)
    2.92-  (init-parquet-json))
    2.93-
    2.94-(defclass parquet-object () ())
    2.95-
    2.96-(defmethod print-object ((obj parquet-object) stream)
    2.97-  "Output a Parquet object to a stream."
    2.98-  (print-unreadable-object (obj stream :type t)
    2.99-    (parquet-encode obj stream)))
   2.100-
   2.101-(defmacro define-parquet-class (name superclasses slots &rest options)
   2.102-  "Define a new subclass of PARQUET-OBJECT with NAME."
   2.103-  `(defclass ,name ,(push 'parquet-object superclasses) ,slots ,@options))
   2.104-
   2.105-(define-parquet-class logical-parquet-object () ())
   2.106-
   2.107-(defgeneric parquet-read (value &optional stream))
   2.108-(defgeneric parquet-write (value &optional stream))
   2.109-
   2.110-(defmethod parquet-write ((value (eql t)) &optional stream)
   2.111-  "Encode a parquet boolean true value."
   2.112-  (declare (ignore value))
   2.113-  (write-byte 1 stream))
   2.114-
   2.115-(defmethod parquet-write ((value (eql nil)) &optional stream)
   2.116-  "Encode a parquet boolean false value."
   2.117-  (declare (ignore value))
   2.118-  (write-byte 0 stream))
   2.119-
   2.120-(defun parquet-encode (value &optional stream)
   2.121-  "Encode a Lisp value and write it to a parquet stream."
   2.122-  (parquet-write value stream))
     3.1--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.2+++ b/lisp/lib/dat/parquet/gen.lisp	Fri Jul 12 19:57:18 2024 -0400
     3.3@@ -0,0 +1,183 @@
     3.4+;;; gen.lisp --- Parquet Lisp Code Generator
     3.5+
     3.6+;; 
     3.7+
     3.8+;;; Code:
     3.9+(in-package :dat/parquet/gen)
    3.10+(defparameter *parquet-json-file* ; default input for LOAD-PARQUET-JSON and INIT-PARQUET-JSON
    3.11+  (probe-file #.(asdf:system-relative-pathname :prelude #P"../.stash/parquet.json"))) ; path is computed at read time (#.); PROBE-FILE yields NIL when the file is absent
    3.12+(defvar *parquet-json* nil) ; set by LOAD-PARQUET-JSON to the value returned by JSON-READ
    3.13+(defun load-parquet-json (&optional (json-file *parquet-json-file*)) ; reads JSON-FILE, stores the result in *PARQUET-JSON* and returns it
    3.14+  (with-open-file (file (or json-file (error "parquet.json not found: generate it before loading parquet definitions"))) ; the default is NIL when the stash file is missing; fail with a clear message
    3.15+    (setq *parquet-json* (json-read file))))
    3.16+
    3.17+(defun %parquet-json-enums () ; raw "enums" entry of the loaded JSON
    3.18+  (json-getf *parquet-json* "enums"))  ; NOTE(review): reads *PARQUET-JSON*, which is NIL until LOAD-PARQUET-JSON has run
    3.19+
    3.20+(defun parquet-json-enum-getf (name)
    3.21+  "Return the \"members\" entry of the first enum whose \"name\" is EQUAL to NAME."
    3.22+  (let ((enum (find name (%parquet-json-enums) :test #'equal :key (lambda (e) (json-getf e "name")))))
    3.23+    (json-getf enum "members")))
    3.24+
    3.25+(defvar *parquet-enums* nil)
    3.26+
    3.27+(defmacro def-parquet-enum (sym name) ; SYM: unevaluated symbol used to build names; NAME: the enum's "name" string in the JSON
    3.28+  `(progn
    3.29+     (defun ,(symbolicate 'parquet-json- sym) () ; accessor PARQUET-JSON-<SYM>: list of member names of enum NAME
    3.30+       (mapcar (lambda (x) (json-getf x "name")) (parquet-json-enum-getf ,name)))
    3.31+     (defparameter ,(intern ; variable *PARQUET-<SYM>* is interned in :DAT/PARQUET rather than the current package
    3.32+                     (concatenate 'string "*PARQUET-" (symbol-name sym) "*")
    3.33+                     :dat/parquet)
    3.34+       (,(symbolicate 'parquet-json- sym))))) ; initialised by calling the accessor defined just above
    3.35+
    3.36+(defun camelcase-name-to-lisp-name (string)
    3.37+  "Upcase camelCase STRING, inserting #\\- at word starts; acronym runs stay whole (\"UUIDType\" -> \"UUID-TYPE\")."
    3.38+  (flet ((upper-at-p (i) (upper-case-p (char string i)))
    3.39+         (lower-at-p (i) (and (< i (length string)) (lower-case-p (char string i)))))
    3.40+    (with-output-to-string (name)
    3.41+      (loop for i from 0 below (length string)
    3.42+            when (and (plusp i) (upper-at-p i) (or (not (upper-at-p (1- i))) (lower-at-p (1+ i)))) do (write-char #\- name) ; break after a non-capital, or before the last capital of an acronym run
    3.43+            do (write-char (char-upcase (char string i)) name)))))
    3.44+
    3.45+(defun snakecase-name-to-lisp-name (string)
    3.46+  "Upcase snake_case STRING, turning every underscore into a hyphen."
    3.47+  (string-upcase (map 'string (lambda (ch) (if (char= ch #\_) #\- ch)) string)))
    3.48+
    3.49+(defun parquet-json-enums () ; called by INIT-PARQUET-JSON, which stores the result in *PARQUET-ENUMS*
    3.50+  (list ; each DEF-PARQUET-ENUM form ends in a DEFPARAMETER, so it evaluates to that variable's symbol
    3.51+   (def-parquet-enum types "Type")
    3.52+   (def-parquet-enum converted-types "ConvertedType")
    3.53+   (def-parquet-enum field-repetition-types "FieldRepetitionType")
    3.54+   (def-parquet-enum encodings "Encoding")
    3.55+   (def-parquet-enum compression-codecs "CompressionCodec")
    3.56+   (def-parquet-enum page-types "PageType")
    3.57+   (def-parquet-enum boundary-orders "BoundaryOrder")))
    3.58+
    3.59+(defvar *parquet-structs* nil) ; list of PARQUET-STRUCT objects, set by INIT-PARQUET-JSON
    3.60+(defstruct (parquet-struct ; one entry of the JSON "structs" list, built by PARQUET-JSON-STRUCTS
    3.61+            (:constructor make-parquet-struct (name doc exceptionp unionp fields))) ; positional (BOA) constructor
    3.62+  name doc exceptionp unionp (fields nil :type list)) ; FIELDS holds PARQUET-STRUCT-FIELD objects
    3.63+
    3.64+(defstruct (parquet-struct-field ; one element of a struct's JSON "fields" list
    3.65+            (:constructor make-parquet-struct-field (key name type-id type doc required))) ; positional (BOA) constructor
    3.66+  key name type-id type doc required) ; TYPE is NIL or a (type-id . detail) cons produced in PARQUET-JSON-STRUCTS
    3.67+
    3.68+(defun parquet-destruct-field (field)
    3.69+  "Flatten FIELD into the list (name key doc type-id type required)."
    3.70+  (loop for reader in '(parquet-struct-field-name
    3.71+                        parquet-struct-field-key parquet-struct-field-doc
    3.72+                        parquet-struct-field-type-id parquet-struct-field-type
    3.73+                        parquet-struct-field-required)
    3.74+        collect (funcall reader field)))
    3.75+
    3.76+(defun parquet-destruct (struct)
    3.77+  "Flatten STRUCT into (name doc unionp exceptionp fields), with every field destructured too."
    3.78+  ;; UNIONP precedes EXCEPTIONP here, the reverse of the constructor's argument order.
    3.79+  (list* (parquet-struct-name struct) (parquet-struct-doc struct)
    3.80+         (parquet-struct-unionp struct) (parquet-struct-exceptionp struct)
    3.81+         (list (mapcar #'parquet-destruct-field (parquet-struct-fields struct)))))
    3.82+
    3.83+(flet ((pq-type-parse (o) (let ((id (json-getf o "typeId"))) ; O: a field's JSON "type" object; result is (typeId . detail)
    3.84+                            (string-case (id :default (warn 'simple-warning :format-control "unknown typeId: ~A" ; any other typeId only warns, so the parsed type is WARN's return value
    3.85+                                                                            :format-arguments (list id)))
    3.86+                              ("list" (cons id (json-getf o "elemTypeId")))
    3.87+                              ("union" (cons id (json-getf o "class")))
    3.88+                              ("struct" (cons id (json-getf o "class")))
    3.89+                              ("enum" (cons id (json-getf o "class")))))))
    3.90+  (defun parquet-json-structs () ;; returns one PARQUET-STRUCT per JSON "structs" entry; keys read: name doc isException isUnion fields
    3.91+  (mapcar
    3.92+   (lambda (s)
    3.93+     (let ((name (json-getf s "name"))
    3.94+           (doc (json-getf s "doc"))
    3.95+           (exceptionp (json-getf s "isException"))
    3.96+           (unionp (json-getf s "isUnion"))
    3.97+           (fields (loop for f in (json-getf s "fields")
    3.98+                         collect
    3.99+                            (let ((key (json-getf f "key"))
   3.100+                                  (name (json-getf f "name"))
   3.101+                                  (type-id (json-getf f "typeId"))
   3.102+                                  ;; "type" is a nested JSON object; when present it is reduced to a cons by PQ-TYPE-PARSE
   3.103+                                  (type (when-let ((ty (json-getf f "type")))
   3.104+                                          (pq-type-parse ty)))
   3.105+                                  (doc (json-getf f "doc"))
   3.106+                                  (required (json-getf f "required")))
   3.107+                              (make-parquet-struct-field key name type-id type doc required)))))
   3.108+       (make-parquet-struct name doc exceptionp unionp fields)))
   3.109+   (json-getf *parquet-json* "structs"))))
   3.110+
   3.111+(defun parquet-json-namespaces () ; raw "namespaces" entry of the loaded JSON
   3.112+  (json-getf *parquet-json* "namespaces"))
   3.113+
   3.114+(defun init-parquet-json (&optional (file *parquet-json-file*))
   3.115+  "Load FILE with LOAD-PARQUET-JSON, then rebuild *PARQUET-ENUMS* and *PARQUET-STRUCTS* from it."
   3.116+  (load-parquet-json file)
   3.117+  (setf *parquet-enums* (parquet-json-enums) *parquet-structs* (parquet-json-structs)))
   3.118+
   3.119+;;; CLOS
   3.120+(defclass parquet-object () ()) ; root class, default superclass in DEFINE-PARQUET-CLASS. NOTE(review): this symbol is read in :DAT/PARQUET/GEN, while pkg.lisp exports PARQUET-OBJECT from :DAT/PARQUET -- confirm they are meant to be the same symbol
   3.121+
   3.122+;; (defmethod print-object ((obj parquet-object) stream)
   3.123+;;   "Output a Parquet object to a stream."
   3.124+;;   (print-unreadable-object (obj stream :type t)))
   3.125+
   3.126+(defmacro define-parquet-class (name superclasses slots &rest options)
   3.127+  "Expand to a DEFCLASS of NAME; an empty SUPERCLASSES list defaults to (PARQUET-OBJECT)."
   3.128+  (list* 'defclass name (or superclasses '(parquet-object)) slots options))
   3.129+
   3.130+(define-parquet-class dat/parquet:parquet-enum-object () ()) ; class name is the symbol exported from :DAT/PARQUET
   3.131+(define-parquet-class dat/parquet:parquet-struct-object () ()) ; empty superclass list, so both inherit from PARQUET-OBJECT via the macro default
   3.132+
   3.133+;;; Codegen
   3.134+
   3.135+;; 8)
   3.136+(defun %define-parquet-structs ()
   3.137+  "Return a list of DEFINE-PARQUET-CLASS forms, one per non-NIL entry of *PARQUET-STRUCTS*; nothing is defined until the forms are evaluated."
   3.138+      (loop for struct in *parquet-structs*
   3.139+            unless (null struct)
   3.140+            collect (let ((name (parquet-struct-name struct))
   3.141+                          (doc (parquet-struct-doc struct))
   3.142+                          (fields (parquet-struct-fields struct)))
   3.143+                      `(define-parquet-class ,(intern (concatenate 'string
   3.144+                                                                   "PARQUET-"
   3.145+                                                                   (camelcase-name-to-lisp-name name))
   3.146+                                                      :dat/parquet)
   3.147+                           (dat/parquet:parquet-struct-object) ; must be the exported symbol the base class is defined under, not one read in this package
   3.148+                         (,@(mapcar (lambda (f)
   3.149+                                      (let ((fdoc (parquet-struct-field-doc f))
   3.150+                                            (fname (snakecase-name-to-lisp-name
   3.151+                                                    (parquet-struct-field-name f))))
   3.152+                                        `(,(symbolicate fname)
   3.153+                                          ,@(when fdoc `(:documentation ,fdoc))
   3.154+                                          :initarg ,(keywordicate fname)
   3.155+                                          ;; TODO 2024-07-12: only "optional" fields get an :initform; other fields start unbound
   3.156+                                          ,@(when (equal "optional" (parquet-struct-field-required f))
   3.157+                                              `(:initform nil)))))
   3.158+                                    fields))
   3.159+                         ,@(when doc `((:documentation ,doc)))))))
   3.160+
   3.161+(defmacro define-parquet-structs () ; expands to (LIST <define-parquet-class form> ...)
   3.162+  `(list
   3.163+     ,@(%define-parquet-structs))) ; runs at macroexpansion time: only structs already in *PARQUET-STRUCTS* at that moment are included
   3.164+
   3.165+(defmacro define-parquet-type (name opts &body body)
   3.166+  "Expand to a DEFTYPE named PARQUET-<NAME> (underscores in the string NAME become hyphens), interned in :DAT/PARQUET, with lambda list OPTS and body BODY."
   3.167+  `(deftype ,(intern (concatenate 'string "PARQUET-" (substitute #\- #\_ name)) :dat/parquet) ,opts ,@body))
   3.168+
   3.169+(defun define-parquet-types ()
   3.170+  "Define a PARQUET-* type for each Parquet physical type and return the list of the defined type names."
   3.171+  (list
   3.172+   (define-parquet-type "BOOLEAN" () 'boolean)
   3.173+   (define-parquet-type "INT32" () '(signed-byte 32))
   3.174+   (define-parquet-type "INT64" () '(signed-byte 64))
   3.175+   (define-parquet-type "INT96" () '(signed-byte 96))
   3.176+   (define-parquet-type "FLOAT" () 'single-float) ; Parquet FLOAT is 32-bit IEEE; plain FLOAT would also admit doubles
   3.177+   (define-parquet-type "DOUBLE" () 'double-float)
   3.178+   (define-parquet-type "BYTE_ARRAY" (&optional size) `(octet-vector ,size))
   3.179+   (define-parquet-type "FIXED_LEN_BYTE_ARRAY" (size) `(octet-vector ,size))))
   3.180+
   3.181+(defun load-parquet (&key (file *parquet-json-file*)) ; entry point: read FILE, then define and export the generated types, classes and enum variables
   3.182+  (init-parquet-json file)
   3.183+  (with-package (:dat/parquet) ; NOTE(review): presumably binds *PACKAGE* for the body, so EXPORT and SYMBOLICATE target :DAT/PARQUET -- confirm in STD
   3.184+    (export (define-parquet-types))
   3.185+    (export (mapcar (lambda (form) (class-name (eval form))) (%define-parquet-structs))) ; build and evaluate the class forms now, after the JSON is loaded; the macro would expand before *PARQUET-STRUCTS* is filled
   3.186+    (export *parquet-enums*)))
     4.1--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     4.2+++ b/lisp/lib/dat/parquet/parquet.lisp	Fri Jul 12 19:57:18 2024 -0400
     4.3@@ -0,0 +1,62 @@
     4.4+;;; parquet.lisp --- Apache Parquet
     4.5+
     4.6+;; Common Lisp implementation of Apache Parquet
     4.7+
     4.8+;;; Commentary:
     4.9+
    4.10+#|
    4.11+https://github.com/apache/parquet-format
    4.12+https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
    4.13+https://github.com/apache/parquet-testing
    4.14+https://github.com/apache/parquet-java
    4.15+https://github.com/apache/arrow-rs
    4.16+|#
    4.17+
    4.18+;; In this package we're being as lazy as possible. To generate our own
    4.19+;; encoder/decoder methods we depend on the file parquet.thrift in the
    4.20+;; parquet-format repo above. The core skelfile includes a script to download
    4.21+;; it and convert it to parquet.json (requires the thrift cli tool). We then
    4.22+;; decode it with DAT/JSON and generate Lisp classes and types.
    4.23+
    4.24+;; 
    4.25+;;; Code:
    4.26+(in-package :dat/parquet)
    4.27+(eval-when (:compile-toplevel) ; evaluated only while COMPILE-FILE processes this file
    4.28+  (load-parquet)) ; NOTE(review): not run when the compiled file or the source is merely loaded -- confirm the generated definitions are not needed at load time
    4.29+
    4.30+(defgeneric parquet-read (value &optional stream)) ; no methods are defined in this file
    4.31+(defgeneric parquet-write (value &optional stream)) ; methods in this file specialise VALUE on T, NIL and STRING
    4.32+
    4.33+(define-bitfield parquet-compression-codec ; one BOOLEAN member per codec name
    4.34+  (uncompressed boolean)
    4.35+  (snappy boolean)
    4.36+  (gzip boolean)
    4.37+  (lzo boolean)
    4.38+  (brotli boolean)
    4.39+  (lz4 boolean)
    4.40+  (zstd boolean)
    4.41+  (lz4-raw boolean)) ; NOTE(review): DEFINE-BITFIELD's semantics are not visible here -- confirm a set of flags, rather than a single-valued enum, is intended
    4.42+
    4.43+;;; Read/Write
    4.44+(defmethod parquet-write ((value (eql t)) &optional stream)
    4.45+  "Encode a parquet boolean true value."
    4.46+  (declare (ignore value))
    4.47+  (write-byte 1 stream)) ; STREAM defaults to NIL, which WRITE-BYTE does not accept: callers must pass a binary output stream
    4.48+
    4.49+(defmethod parquet-write ((value (eql nil)) &optional stream)
    4.50+  "Encode a parquet boolean false value."
    4.51+  (declare (ignore value))
    4.52+  (write-byte 0 stream)) ; STREAM defaults to NIL, which WRITE-BYTE does not accept: callers must pass a binary output stream
    4.53+
    4.54+(defmethod parquet-write ((value string) &optional stream)) ; stub: empty body, writes nothing and returns NIL
    4.55+
    4.56+;;; Encode/Decode
    4.57+(defun parquet-encode (value &optional stream)
    4.58+  "Encode a Lisp value and write it to a parquet stream."
    4.59+  (parquet-write value stream)) ; thin wrapper: all dispatch happens in PARQUET-WRITE's methods
    4.60+
    4.61+(defun parquet-decode (string &key (start 0) end)
    4.62+  "Convert a PARQUET string into a Lisp object."
    4.63+  (with-input-from-string (stream string :start start :end end) ; reads only the START..END portion of STRING
    4.64+    (values (parquet-read stream) ; NOTE(review): the stream is passed as PARQUET-READ's VALUE argument, not its STREAM argument -- confirm
    4.65+            (file-position stream)))) ; second value: the stream position after reading
     5.1--- a/lisp/lib/dat/pkg.lisp	Thu Jul 11 21:40:29 2024 -0400
     5.2+++ b/lisp/lib/dat/pkg.lisp	Fri Jul 12 19:57:18 2024 -0400
     5.3@@ -233,9 +233,20 @@
     5.4   (:use :cl :std :dat/proto)
     5.5   (:export))
     5.6 
     5.7+(defpackage :dat/parquet/gen
     5.8+  (:use :cl :std :dat/proto :dat/json))
     5.9+
    5.10 (defpackage :dat/parquet
    5.11   (:use :cl :std :dat/proto :dat/json)
    5.12-  (:export))
    5.13+  (:import-from :dat/parquet/gen :load-parquet)
    5.14+  (:export
    5.15+   :parquet-object
    5.16+   :parquet-enum-object
    5.17+   :parquet-struct-object
    5.18+   :parquet-read
    5.19+   :parquet-write
    5.20+   :parquet-encode
    5.21+   :parquet-decode))
    5.22 
    5.23 (pkg:defpkg :dat
    5.24   (:use-reexport :dat/proto :dat/csv :dat/arff