changelog shortlog graph tags branches changeset files revisions annotate raw help

Mercurial > core / lisp/lib/dat/parquet/gen.lisp

changeset 635: 849f72b72b41
parent: 4d34907c69eb
child: b88bf15f60d0
author: Richard Westhaver <ellis@rwest.io>
date: Mon, 02 Sep 2024 18:31:19 -0400
permissions: -rw-r--r--
description: add back fuzz.lisp and proper codegen for parquet.json thrift definitions
1 ;;; gen.lisp --- Parquet Lisp Code Generator
2 
3 ;;
4 
5 ;;; Code:
6 (defpackage :dat/parquet/gen ;; not public API
7  (:use :cl :std :dat/proto :dat/json)
8  (:export :load-parquet))
9 
10 (in-package :dat/parquet/gen)
11 
12 (defparameter *parquet-json-file*
13  (or (probe-file #.(asdf:system-relative-pathname :prelude #P"../.stash/parquet.json"))
14  (warn "*PARQUET-JSON-FILE* not found")))
15 
16 (defparameter *parquet-output-file*
17  #.(asdf:system-relative-pathname :dat #P"parquet/thrift.lisp"))
18 
19 (defvar *parquet-json* nil)
20 
21 (defun %parquet-json-enums ()
22  (json-getf *parquet-json* "enums"))
23 
24 (defun parquet-json-enum-getf (name)
25  (json-getf
26  (find-if (lambda (x) (equal name (json-getf x "name"))) (%parquet-json-enums))
27  "members"))
28 
29 (defvar *parquet-enums* nil)
30 
31 (defmacro define-parquet-enum (sym name)
32  `(progn
33  (defun ,(symbolicate "PARQUET-JSON-" sym) ()
34  (mapcar (lambda (x) (keywordicate (snakecase-name-to-lisp-name (json-getf x "name"))))
35  (parquet-json-enum-getf ,name)))
36  (defparameter ,(intern
37  (concatenate 'string "*PARQUET-" (symbol-name sym) "*")
38  :dat/parquet)
39  (,(symbolicate "PARQUET-JSON-" sym)))))
40 
41 (defun camelcase-name-to-lisp-name (string)
42  (string-upcase
43  (with-output-to-string (name)
44  (loop for i from 0 below (length string)
45  for c across string
46  when (and (upper-case-p c) (not (zerop i)))
47  do (write-char #\- name)
48  do (write-char c name)))))
49 
50 (defun snakecase-name-to-lisp-name (string)
51  (string-upcase
52  (substitute #\- #\_ string)))
53 
54 (labels ((parse-type-id (type-id)
55  (when type-id
56  (string-case (type-id :default nil)
57  ("bool" 'boolean)
58  ("byte" 'signed-byte)
59  ("i16" '(signed-byte 16))
60  ("i32" '(signed-byte 32))
61  ("i64" '(signed-byte 64))
62  ("double" 'double-float)
63  ("string" 'string)
64  ("list" 'vector)
65  ("binary" 'octet-vector)
66  ("set" 'vector)
67  ("enum" '(signed-byte 32))
68  ("union" 'union)
69  ("struct" 'struct))))
70  (%intern (name)
71  (if (stringp name)
72  (intern
73  (cond
74  ((equal name "UUIDType") "PARQUET-UUID-TYPE")
75  (t (concatenate 'string
76  "PARQUET-"
77  (camelcase-name-to-lisp-name name))))
78  :dat/parquet)
79  name))
80  (parse-type (o)
81  (when o
82  (string-case ((json-getf o "typeId"))
83  ("union" (%intern (json-getf o "class")))
84  ("list"
85  (if-let ((elt (json-getf o "elemType" nil)))
86  (%intern (parse-type elt))
87  (parse-type-id (json-getf o "elemTypeId"))))
88  ("set"
89  (if-let ((elt (json-getf o "elemType" nil)))
90  (%intern (parse-type elt))
91  (parse-type-id (json-getf o "elemTypeId"))))
92  ("struct" (%intern (json-getf o "class")))
93  ("enum" (%intern (json-getf o "class")))))))
94  (defun convert-parquet-struct-field-type (field) ;; technically part of thrift type system
95  (let* ((type-id (parse-type-id (parquet-struct-field-type-id field)))
96  (type (parse-type (parquet-struct-field-type field)))
97  (required (parquet-struct-field-required field)))
98  (let ((ret (cond
99  ((eql 'vector type-id) `(vector ,type))
100  (t (or type type-id)))))
101  (if (equal "optional" required)
102  `(or null ,ret)
103  ret)))))
104 
105 (defparameter *parquet-structs* nil)
106 
107 (defstruct (parquet-struct
108  (:constructor make-parquet-struct (name doc exceptionp unionp fields)))
109  name doc exceptionp unionp (fields nil :type list))
110 
111 (defstruct (parquet-struct-field
112  (:constructor make-parquet-struct-field (key name type-id type doc required)))
113  key name type-id type doc required)
114 
115 (defun parquet-json-structs () ;; name doc isException isUnion fields
116  (mapcar
117  (lambda (s)
118  (let ((name (json-getf s "name"))
119  (doc (json-getf s "doc"))
120  (exceptionp (json-getf s "isException"))
121  (unionp (json-getf s "isUnion"))
122  (fields (loop for f in (json-getf s "fields")
123  collect
124  (let ((key (json-getf f "key"))
125  (name (json-getf f "name"))
126  (type-id (json-getf f "typeId"))
127  ;; json object - needs additional parsing
128  (type (json-getf f "type"))
129  (doc (json-getf f "doc"))
130  (required (json-getf f "required")))
131  (make-parquet-struct-field key name type-id type doc required)))))
132  (make-parquet-struct name doc exceptionp unionp fields)))
133  (json-getf *parquet-json* "structs")))
134 
135 (defun parquet-json-namespaces ()
136  (json-getf *parquet-json* "namespaces"))
137 
138 (eval-always
139  (defun init-parquet-json (&optional (file *parquet-json-file*))
140  (with-open-file (file file)
141  (setq *parquet-json* (json-read file)))
142  (setq *parquet-enums* (parquet-json-enums))
143  (setq *parquet-structs* (parquet-json-structs))))
144 
145 ;;; CLOS
146 
147 ;; (defmethod print-object ((obj parquet-object) stream)
148 ;; "Output a Parquet object to a stream."
149 ;; (print-unreadable-object (obj stream :type t)))
150 
151 (defmacro define-parquet-class (name superclasses slots &rest options)
152  "Define a new subclass of PARQUET-OBJECT with NAME."
153  `(defclass ,name ,@(if-let ((s superclasses)) (list s) `((dat/parquet::parquet-object))) ,slots ,@options))
154 
155 ;;; Codegen
156 
157 ;; 8)
158 (eval-always
159  (defun %define-parquet-structs ()
160  "Define all known values in *PARQUET-STRUCTS* using DEFINE-PARQUET-CLASS (DEFCLASS)."
161  (loop for struct in *parquet-structs*
162  unless (null struct)
163  collect (let* ((name (parquet-struct-name struct))
164  (doc (parquet-struct-doc struct))
165  (fields (parquet-struct-fields struct))
166  (class-name (intern (cond
167  ((equal name "UUIDType") "PARQUET-UUID-TYPE")
168  (t (concatenate 'string
169  "PARQUET-"
170  (camelcase-name-to-lisp-name name))))
171  :dat/parquet)))
172  `(progn
173  (defclass ,class-name (dat/parquet::parquet-object)
174  (,@(mapcar (lambda (f)
175  (let ((fdoc (parquet-struct-field-doc f))
176  (fname (snakecase-name-to-lisp-name
177  (parquet-struct-field-name f))))
178  `(,(intern fname :dat/parquet)
179  ,@(when fdoc `(:documentation ,fdoc))
180  :initarg ,(keywordicate fname)
181  ;; TODO 2024-07-12:
182  ,@(when (equal "optional" (parquet-struct-field-required f))
183  `(:initform nil))
184  ,@(when-let ((ty (convert-parquet-struct-field-type f)))
185  `(:type ,ty)))))
186  fields))
187  ,@(when doc `((:documentation ,doc)))))))))
188 
189 (defmacro define-parquet-type (name opts &body body)
190  "Define a parquet type with DEFTYPE which maps to LISP-TYPE."
191  `(deftype ,(intern (concatenate 'string "PARQUET-" (substitute #\- #\_ name)) :dat/parquet) ,opts ,@body))
192 
193 (defun parse-parquet-thrift-definitions (&key (input *parquet-json-file*)
194  (output #.(asdf:system-relative-pathname :dat "parquet/thrift.lisp")))
195  (init-parquet-json input)
196  (with-open-file (defs output :direction :output :if-exists :supersede :if-does-not-exist :create)
197  (format defs ";;; ~a --- Parquet Thrift Definitions -*- buffer-read-only:t -*-
198 
199 ;; input = ~a
200 
201 ;; This file was generated automatically by
202 ;; DAT/PARQUET/GEN:PARSE-PARQUET-THRIFT-DEFINITIONS
203 
204 ;; Do not modify.
205 
206 ;;; Code:
207 (in-package :dat/parquet)" output input)
208  (format defs "~2%")
209  (let ((enums '((define-parquet-enum types "Type")
210  (define-parquet-enum converted-types "ConvertedType")
211  (define-parquet-enum field-repetition-types "FieldRepetitionType")
212  (define-parquet-enum encodings "Encoding")
213  (define-parquet-enum compression-codecs "CompressionCodec")
214  (define-parquet-enum page-types "PageType")
215  (define-parquet-enum boundary-orders "BoundaryOrder")))
216  (types '((define-parquet-type "BOOLEAN" () 'boolean)
217  (define-parquet-type "INT32" () '(signed-byte 32))
218  (define-parquet-type "INT64" () '(signed-byte 64))
219  (define-parquet-type "INT96" () '(signed-byte 96))
220  (define-parquet-type "FLOAT" () 'float)
221  (define-parquet-type "DOUBLE" () 'double-float)
222  (define-parquet-type "BYTE_ARRAY" (&optional size) `(octet-vector ,size))
223  (define-parquet-type "FIXED_LEN_BYTE_ARRAY" (size) `(octet-vector ,size))))
224  (structs (mapcar #'macroexpand-1 (%define-parquet-structs))))
225  ;; expands to a progn, so we just take the cdr
226  (dolist (en enums)
227  (dolist (f (cdr (macroexpand en)))
228  (write f :stream defs :case :downcase :readably t)
229  (terpri defs)))
230  (dolist (ty types)
231  (dolist (f (cdr (macroexpand ty)))
232  (write f :stream defs :case :downcase :readably t)
233  (terpri defs)))
234  (dolist (st structs)
235  (dolist (f (cdr (macroexpand st)))
236  (write f :stream defs :case :downcase :readably t)
237  (terpri defs))))))