changeset 539: |
cf0c1933289f |
parent 538: |
d84e518059be |
child 540: |
bd49b7e2c623 |
author: |
Richard Westhaver <ellis@rwest.io> |
date: |
Thu, 11 Jul 2024 21:40:29 -0400 |
files: |
lisp/ffi/arrow/arrow.asd lisp/ffi/arrow/pkg.lisp lisp/ffi/arrow/tests.lisp lisp/lib/dat/parquet.lisp |
description: |
parquet+arrow |
1.1--- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2+++ b/lisp/ffi/arrow/arrow.asd Thu Jul 11 21:40:29 2024 -0400
1.3@@ -0,0 +1,11 @@
1.4+;;; arrow.asd --- Apache Arrow System Definitions
1.5+(defsystem :arrow ; ASDF system for the Arrow FFI package
1.6+ :depends-on (:std :log)
1.7+ :description "A thin FFI wrapper for Arrow."
1.8+ :components ((:file "pkg")) ; the only source file of this system is pkg.lisp
1.9+ :in-order-to ((test-op (test-op "arrow/tests")))) ; testing :arrow delegates to the arrow/tests system
1.10+
1.11+(defsystem :arrow/tests ; test system reached through the TEST-OP of :arrow
1.12+ :depends-on (:std :log :rt :arrow)
1.13+ :components ((:file "tests"))
1.14+ :perform (test-op (o c) (symbol-call :rt :do-tests :arrow))) ; looks up DO-TESTS in package RT when the op runs and calls it with :arrow
2.1--- /dev/null Thu Jan 01 00:00:00 1970 +0000
2.2+++ b/lisp/ffi/arrow/pkg.lisp Thu Jul 11 21:40:29 2024 -0400
2.3@@ -0,0 +1,40 @@
2.4+;;; pkg.lisp --- Apache Arrow FFI
2.5+
2.6+;;
2.7+
2.8+;;; Code:
2.9+(defpackage :arrow
2.10+ (:use :cl :std :sb-alien)
2.11+ (:export))
2.12+
2.13+(in-package :arrow)
2.14+
2.15+(define-alien-loader "arrow" t "/usr/lib/") ; NOTE(review): presumably defines LOAD-ARROW (called in tests.lisp) for a library under /usr/lib/ -- confirm against the macro definition
2.16+
2.17+(define-alien-type arrow-release-function (function void (* (struct nil)))) ; function type: returns void, takes one pointer to an anonymous struct; the RELEASE slots below point to it
2.18+
2.19+(define-alien-type arrow-schema ; mirrors struct ArrowSchema of the Arrow C data interface
2.20+ (struct arrow-schema
2.21+ (format c-string) ; null-terminated type format string
2.22+ (name c-string)
2.23+ (metadata (* (unsigned 8))) ; length-prefixed binary blob that may contain NUL bytes, so it must not be converted as a C string
2.24+ (flags (signed 64)) ; int64_t in the C declaration; LONG is not 64 bits wide on every platform
2.25+ (n-children (signed 64))
2.26+ (children (* (* (struct arrow-schema)))) ; struct ArrowSchema **: pointer to N-CHILDREN child pointers, not an inline array
2.27+ (dictionary (* (struct arrow-schema)))
2.28+ (release (* arrow-release-function)) ; callback pointer
2.29+ (private-data (* t)))) ; opaque void * owned by the producer
2.30+
2.31+
2.32+(define-alien-type arrow-array ; mirrors struct ArrowArray of the Arrow C data interface
2.33+ (struct arrow-array
2.34+ (length (signed 64)) ; the five integer fields are int64_t in the C declaration; LONG is not 64 bits wide on every platform
2.35+ (null-count (signed 64))
2.36+ (offset (signed 64))
2.37+ (n-buffers (signed 64))
2.38+ (n-children (signed 64))
2.39+ (buffers (* (* t))) ; const void **: pointer to N-BUFFERS buffer pointers, not an inline array
2.40+ (children (* (* (struct arrow-array)))) ; struct ArrowArray **: pointer to N-CHILDREN child pointers
2.41+ (dictionary (* (struct arrow-array)))
2.42+ (release (* arrow-release-function)) ; callback pointer
2.43+ (private-data (* t)))) ; opaque void * owned by the producer
3.1--- /dev/null Thu Jan 01 00:00:00 1970 +0000
3.2+++ b/lisp/ffi/arrow/tests.lisp Thu Jul 11 21:40:29 2024 -0400
3.3@@ -0,0 +1,12 @@
3.4+;;; tests.lisp --- Apache Arrow FFI Tests
3.5+
3.6+;;
3.7+
3.8+;;; Code:
3.9+(defpackage :arrow/tests
3.10+ (:use :cl :std :sb-alien :rt :arrow))
3.11+(in-package :arrow/tests)
3.12+(defsuite :arrow) ; same name that arrow.asd passes to DO-TESTS
3.13+(in-suite :arrow)
3.14+(load-arrow) ; NOTE(review): presumably generated by DEFINE-ALIEN-LOADER in pkg.lisp; runs whenever this file is loaded -- confirm
3.15+(deftest sanity ()) ; placeholder test with an empty body
4.1--- /dev/null Thu Jan 01 00:00:00 1970 +0000
4.2+++ b/lisp/lib/dat/parquet.lisp Thu Jul 11 21:40:29 2024 -0400
4.3@@ -0,0 +1,119 @@
4.4+;;; parquet.lisp --- Apache Parquet
4.5+
4.6+;; Common Lisp implementation of Apache Parquet
4.7+
4.8+;;; Commentary:
4.9+
4.10+#|
4.11+https://github.com/apache/parquet-format
4.12+https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
4.13+https://github.com/apache/parquet-testing
4.14+https://github.com/apache/parquet-java
4.15+https://github.com/apache/arrow-rs
4.16+|#
4.17+
4.18+;; In this package we're being as lazy as possible. To generate our own
4.19+;; encoder/decoder methods we depend on the file parquet.thrift in the
4.20+;; parquet-format repo above. The core skelfile includes a script to download
4.21+;; it and convert it to parquet.json (requires the thrift cli tool). We then
4.22+;; decode it with DAT/JSON and visit all elements recursively, generating lisp
4.23+;; code using pre-compiled macros.
4.24+
4.25+;;
4.26+;;; Code:
4.27+(in-package :dat/parquet)
4.28+
4.29+(eval-always
4.30+ (defparameter *default-parquet-json-file* ; default input of LOAD-PARQUET-JSON
4.31+ (probe-file #.(asdf:system-relative-pathname :prelude #P"../.stash/parquet.json"))) ; path is computed at read time (#.); PROBE-FILE yields NIL when the file does not exist
4.32+ (defvar *parquet-json* nil) ; decoded JSON, assigned by LOAD-PARQUET-JSON
4.33+ (defun load-parquet-json (&optional (json-file *default-parquet-json-file*)) ; decode JSON-FILE into *PARQUET-JSON* and return the decoded value
4.34+ (unless json-file (error "parquet.json not found: generate it from parquet.thrift before compiling")) ; the default is NIL when PROBE-FILE found no file; fail with a clear message instead of a type error from OPEN
4.35+ (with-open-file (file json-file) (setq *parquet-json* (json-read file))))
4.36+ (defun parquet-json-enums () ; value stored under the "enums" key of *PARQUET-JSON*
4.37+ (json-getf *parquet-json* "enums"))
4.38+
4.39+ (defun parquet-json-enum-getf (name) ; "members" of the first enum entry whose "name" is EQUAL to NAME
4.40+ (let ((enum (find name (parquet-json-enums) :test #'equal
4.41+ :key (lambda (entry) (json-getf entry "name")))))
4.42+ (json-getf enum "members"))) ; ENUM is NIL when no entry matched
4.43+
4.44+ (defmacro def-parquet-enum (sym name) ; SYM: symbol used to build the generated names; NAME: enum name string looked up in parquet.json
4.45+ `(progn
4.46+ (defvar ,(symbolicate '*parquet- sym '*) nil) ; *PARQUET-<SYM>*, initialised to NIL; nothing in this expansion assigns it
4.47+ (defun ,(symbolicate 'parquet-json- sym) () ; PARQUET-JSON-<SYM>: list of the "name" values of the enum's members
4.48+ (mapcar (lambda (x) (json-getf x "name")) (parquet-json-enum-getf ,name)))))
4.49+
4.50+ (defvar *parquet-structs* nil) ; assigned by INIT-PARQUET-JSON from PARQUET-JSON-STRUCTS
4.51+ (defstruct (parquet-struct ; one entry of the "structs" list in parquet.json
4.52+ (:constructor make-parquet-struct (name doc exceptionp unionp fields))) ; constructor takes positional arguments
4.53+ name doc exceptionp unionp (fields nil :type list))
4.54+ (defstruct (parquet-struct-field ; one entry of a struct's "fields" list
4.55+ (:constructor make-parquet-struct-field (key name type-id type doc required))) ; constructor takes positional arguments
4.56+ key name type-id type doc required)
4.57+ (defun parquet-json-structs () ;; returns one PARQUET-STRUCT per "structs" entry: name doc isException isUnion fields
4.58+ (mapcar
4.59+ (lambda (s)
4.60+ (let ((name (json-getf s "name"))
4.61+ (doc (json-getf s "doc"))
4.62+ (exceptionp (json-getf s "isException"))
4.63+ (unionp (json-getf s "isUnion"))
4.64+ (fields (loop for f in (json-getf s "fields") ; one PARQUET-STRUCT-FIELD per "fields" entry
4.65+ collect
4.66+ (let ((key (json-getf f "key"))
4.67+ (name (json-getf f "name"))
4.68+ (type-id (json-getf f "typeId"))
4.69+ ;; json object - stored undecoded, needs additional parsing
4.70+ (type (json-getf f "type"))
4.71+ (doc (json-getf f "doc"))
4.72+ (required (json-getf f "required")))
4.73+ (make-parquet-struct-field key name type-id type doc required)))))
4.74+ (make-parquet-struct name doc exceptionp unionp fields))) ; the struct must be the LET's value so MAPCAR collects it
4.75+ (json-getf *parquet-json* "structs")))
4.76+
4.77+ (defun parquet-json-namespaces () ; value stored under the "namespaces" key of *PARQUET-JSON*
4.78+ (json-getf *parquet-json* "namespaces"))
4.79+
4.80+ (defun init-parquet-json () ; load parquet.json, define the enum variables and accessors, then store the structs
4.81+ (load-parquet-json) ; called without arguments, so it reads *DEFAULT-PARQUET-JSON-FILE*
4.82+ (def-parquet-enum types "Type") ; each DEF-PARQUET-ENUM expands to a DEFVAR plus a DEFUN evaluated when this function is called
4.83+ (def-parquet-enum converted-types "ConvertedType")
4.84+ (def-parquet-enum field-repetition-types "FieldRepetitionType")
4.85+ (def-parquet-enum encodings "Encoding")
4.86+ (def-parquet-enum compression-codecs "CompressionCodec")
4.87+ (def-parquet-enum page-types "PageType")
4.88+ (def-parquet-enum boundary-orders "BoundaryOrder")
4.89+ (setq *parquet-structs* (parquet-json-structs)))) ; the last paren closes the enclosing EVAL-ALWAYS form
4.90+
4.91+(eval-when (:compile-toplevel) ; only :COMPILE-TOPLEVEL is listed, so the body runs while the file is compiled and not when it is loaded
4.92+ (init-parquet-json)) ; NOTE(review): state set up here is absent in an image that only loads the compiled file -- confirm this is intended
4.93+
4.94+(defclass parquet-object () ()) ; root class with no superclasses and no slots; DEFINE-PARQUET-CLASS adds it to every class it defines
4.95+
4.96+(defmethod print-object ((obj parquet-object) stream)
4.97+ "Output a Parquet object to a stream."
4.98+ (print-unreadable-object (obj stream :type t) ; prints the #<...> wrapper with the class name
4.99+ (parquet-encode obj stream))) ; NOTE(review): the only PARQUET-WRITE methods visible here specialize on T and NIL -- verify one exists for PARQUET-OBJECT
4.100+
4.101+(defmacro define-parquet-class (name superclasses slots &rest options) ; SLOTS and OPTIONS are passed through to DEFCLASS unchanged
4.102+ "Define a new subclass of PARQUET-OBJECT with NAME."
4.103+ `(defclass ,name (,@(remove 'parquet-object superclasses) parquet-object) ,slots ,@options)) ; PARQUET-OBJECT goes last, exactly once: listing it before one of its own subclasses makes the class precedence list inconsistent
4.104+
4.105+(define-parquet-class logical-parquet-object () ()) ; no slots; PARQUET-OBJECT is its only superclass
4.106+
4.107+(defgeneric parquet-read (value &optional stream)) ; no methods are defined for it in this section
4.108+(defgeneric parquet-write (value &optional stream)) ; methods below cover the values T and NIL
4.109+
4.110+(defmethod parquet-write ((value (eql t)) &optional stream) ; STREAM is NIL when omitted
4.111+ "Encode a parquet boolean true value."
4.112+ (declare (ignore value))
4.113+ (write-byte 1 stream)) ; writes the single byte 1; WRITE-BYTE needs a binary output stream, so callers must supply STREAM
4.114+
4.115+(defmethod parquet-write ((value (eql nil)) &optional stream) ; STREAM is NIL when omitted
4.116+ "Encode a parquet boolean false value."
4.117+ (declare (ignore value))
4.118+ (write-byte 0 stream)) ; writes the single byte 0; WRITE-BYTE needs a binary output stream, so callers must supply STREAM
4.119+
4.120+(defun parquet-encode (value &optional stream) ; thin entry point over the PARQUET-WRITE generic function
4.121+ "Encode a Lisp value and write it to a parquet stream."
4.122+ (parquet-write value stream)) ; passes both arguments through and returns what PARQUET-WRITE returns