changelog shortlog graph tags branches changeset files revisions annotate raw help

Mercurial > core / lisp/lib/dat/parquet/thrift.lisp

changeset 637: b88bf15f60d0
parent: 849f72b72b41
child: 642b3b82b20d
author: Richard Westhaver <ellis@rwest.io>
date: Wed, 04 Sep 2024 22:02:21 -0400
permissions: -rw-r--r--
description: parquet tweaks, import ox-man
1 ;;; /home/ellis/comp/core/lisp/lib/dat/parquet/thrift.lisp --- Parquet Thrift Definitions -*- buffer-read-only:t -*-
2 
3 ;; input = /home/ellis/comp/core/.stash/parquet.json
4 
5 ;; This file was generated automatically by
6 ;; DAT/PARQUET/GEN:PARSE-PARQUET-THRIFT-DEFINITIONS
7 
8 ;; Do not modify.
9 
10 ;;; Code:
11 (in-package :dat/parquet)
12 
;; NOTE: this file is generated by
;; DAT/PARQUET/GEN:PARSE-PARQUET-THRIFT-DEFINITIONS; mirror this refactoring
;; in the generator or it will be lost on the next regeneration.

(defun %parquet-json-enum-keywords (enum-name)
  "Collect the member names of the enum called ENUM-NAME.

Every entry returned by (PARQUET-JSON-ENUM-GETF ENUM-NAME) has its \"name\"
field read with JSON-GETF; that name is passed through
SNAKECASE-NAME-TO-LISP-NAME and then KEYWORDICATE.  Returns a fresh list
with one element per entry, in entry order."
  (mapcar (lambda (member)
            (keywordicate
             (snakecase-name-to-lisp-name (json-getf member "name"))))
          (parquet-json-enum-getf enum-name)))

(defun parquet-json-types ()
  "Member names of the \"Type\" enum."
  (%parquet-json-enum-keywords "Type"))
(defparameter *parquet-types* (parquet-json-types))

(defun parquet-json-converted-types ()
  "Member names of the \"ConvertedType\" enum."
  (%parquet-json-enum-keywords "ConvertedType"))
(defparameter *parquet-converted-types* (parquet-json-converted-types))

(defun parquet-json-field-repetition-types ()
  "Member names of the \"FieldRepetitionType\" enum."
  (%parquet-json-enum-keywords "FieldRepetitionType"))
(defparameter *parquet-field-repetition-types*
  (parquet-json-field-repetition-types))

(defun parquet-json-encodings ()
  "Member names of the \"Encoding\" enum."
  (%parquet-json-enum-keywords "Encoding"))
(defparameter *parquet-encodings* (parquet-json-encodings))

(defun parquet-json-compression-codecs ()
  "Member names of the \"CompressionCodec\" enum."
  (%parquet-json-enum-keywords "CompressionCodec"))
(defparameter *parquet-compression-codecs* (parquet-json-compression-codecs))

(defun parquet-json-page-types ()
  "Member names of the \"PageType\" enum."
  (%parquet-json-enum-keywords "PageType"))
(defparameter *parquet-page-types* (parquet-json-page-types))

(defun parquet-json-boundary-orders ()
  "Member names of the \"BoundaryOrder\" enum."
  (%parquet-json-enum-keywords "BoundaryOrder"))
(defparameter *parquet-boundary-orders* (parquet-json-boundary-orders))
;; Written as a portable DEFTYPE rather than its SBCL macroexpansion
;; (SB-IMPL::%DEFTYPE and friends are unexported internals).
(deftype parquet-boolean ()
  "Alias for the CL type BOOLEAN."
  'boolean)
;; Portable DEFTYPE in place of the SBCL-internal macroexpansion.
(deftype parquet-int32 ()
  "Alias for (SIGNED-BYTE 32)."
  '(signed-byte 32))
;; Portable DEFTYPE in place of the SBCL-internal macroexpansion.
(deftype parquet-int64 ()
  "Alias for (SIGNED-BYTE 64)."
  '(signed-byte 64))
;; Portable DEFTYPE in place of the SBCL-internal macroexpansion.
(deftype parquet-int96 ()
  "Alias for (SIGNED-BYTE 96)."
  '(signed-byte 96))
;; Portable DEFTYPE in place of the SBCL-internal macroexpansion.
;; NOTE(review): FLOAT also admits DOUBLE-FLOAT values; if this is meant to
;; model Parquet's 32-bit FLOAT, SINGLE-FLOAT would be tighter.  Left as
;; FLOAT so existing callers keep working -- confirm in the generator.
(deftype parquet-float ()
  "Alias for the CL type FLOAT."
  'float)
;; Portable DEFTYPE in place of the SBCL-internal macroexpansion.
(deftype parquet-double ()
  "Alias for the CL type DOUBLE-FLOAT."
  'double-float)
;; Portable DEFTYPE in place of the SBCL-internal macroexpansion
;; (SB-INT:NAMED-DS-BIND, uninterned #:EXPR).  As with any DEFTYPE lambda
;; list, an omitted optional SIZE defaults to the symbol *.
(deftype parquet-byte-array (&optional size)
  "Expands to (OCTET-VECTOR SIZE); SIZE is * when not supplied."
  `(octet-vector ,size))
;; Portable DEFTYPE in place of the SBCL-internal macroexpansion.
;; Unlike PARQUET-BYTE-ARRAY, SIZE is a required type argument here.
(deftype parquet-fixed-len-byte-array (size)
  "Expands to (OCTET-VECTOR SIZE)."
  `(octet-vector ,size))
;; All three slots default to NIL.  The docstrings below restore the "/" and
;; double-quote characters that the generator had turned into backslashes.
(defclass parquet-size-statistics (dat/parquet:parquet-object)
  ((unencoded-byte-array-data-bytes
    :initarg :unencoded-byte-array-data-bytes
    :initform nil
    :type (or null (signed-byte 64))
    :documentation
    "The number of physical bytes stored for BYTE_ARRAY data values assuming
no encoding. This is exclusive of the bytes needed to store the length of
each byte array. In other words, this field is equivalent to the `(size
of PLAIN-ENCODING the byte array values) - (4 bytes * number of values
written)`. To determine unencoded sizes of other types readers can use
schema information multiplied by the number of non-null and null values.
The number of null/non-null values can be inferred from the histograms
below.

For example, if a column chunk is dictionary-encoded with dictionary
[\"a\", \"bc\", \"cde\"], and a data page contains the indices [0, 0, 1, 2],
then this value for that data page should be 7 (1 + 1 + 2 + 3).

This field should only be set for types that use BYTE_ARRAY as their
physical type.
")
   (repetition-level-histogram
    :initarg :repetition-level-histogram
    :initform nil
    :type (or null (vector (signed-byte 64)))
    :documentation
    "When present, there is expected to be one element corresponding to each
repetition (i.e. size=max repetition_level+1) where each element
represents the number of times the repetition level was observed in the
data.

This field may be omitted if max_repetition_level is 0 without loss
of information.

")
   (definition-level-histogram
    :initarg :definition-level-histogram
    :initform nil
    :type (or null (vector (signed-byte 64)))
    :documentation
    "Same as repetition_level_histogram except for definition levels.

This field may be omitted if max_definition_level is 0 or 1 without
loss of information.

"))
  (:documentation
   "A structure for capturing metadata for estimating the unencoded,
uncompressed size of data written. This is useful for readers to estimate
how much memory is needed to reconstruct data in their memory model and for
fine grained filter pushdown on nested structures (the histograms contained
in this structure can help determine the number of nulls at a particular
nesting level and maximum length of lists).
"))
;; Every slot defaults to NIL.  MIN and MIN-VALUE carry no docstring of
;; their own; the MAX / MAX-VALUE docstrings describe each pair.  The
;; MAX-VALUE docstring restores the double quotes the generator had turned
;; into backslashes.
(defclass parquet-statistics (dat/parquet:parquet-object)
  ((max
    :initarg :max
    :initform nil
    :type (or null octet-vector)
    :documentation
    "DEPRECATED: min and max value of the column. Use min_value and max_value.

Values are encoded using PLAIN encoding, except that variable-length byte
arrays do not include a length prefix.

These fields encode min and max values determined by signed comparison
only. New files should use the correct order for a column's logical type
and store the values in the min_value and max_value fields.

To support older readers, these may be set when the column order is
signed.
")
   (min
    :initarg :min
    :initform nil
    :type (or null octet-vector))
   (null-count
    :initarg :null-count
    :initform nil
    :type (or null (signed-byte 64))
    :documentation "count of null value in the column
")
   (distinct-count
    :initarg :distinct-count
    :initform nil
    :type (or null (signed-byte 64))
    :documentation "count of distinct values occurring
")
   (max-value
    :initarg :max-value
    :initform nil
    :type (or null octet-vector)
    :documentation
    "Lower and upper bound values for the column, determined by its ColumnOrder.

These may be the actual minimum and maximum values found on a page or column
chunk, but can also be (more compact) values that do not exist on a page or
column chunk. For example, instead of storing \"Blart Versenwald III\", a writer
may set min_value=\"B\", max_value=\"C\". Such more compact values must still be
valid values within the column's logical type.

Values are encoded using PLAIN encoding, except that variable-length byte
arrays do not include a length prefix.
")
   (min-value
    :initarg :min-value
    :initform nil
    :type (or null octet-vector))
   (is-max-value-exact
    :initarg :is-max-value-exact
    :initform nil
    :type (or null boolean)
    :documentation
    "If true, max_value is the actual maximum value for a column
")
   (is-min-value-exact
    :initarg :is-min-value-exact
    :initform nil
    :type (or null boolean)
    :documentation
    "If true, min_value is the actual minimum value for a column
"))
  (:documentation "Statistics per row group and per page
All fields are optional.
"))
;; Slot-less marker classes for logical type annotations.  Only the first
;; one carries a docstring.
(defclass parquet-string-type (dat/parquet:parquet-object)
  ()
  (:documentation "Empty structs to use as logical type annotations
"))

(defclass parquet-uuid-type (dat/parquet:parquet-object)
  ())

(defclass parquet-map-type (dat/parquet:parquet-object)
  ())

(defclass parquet-list-type (dat/parquet:parquet-object)
  ())

(defclass parquet-enum-type (dat/parquet:parquet-object)
  ())

(defclass parquet-date-type (dat/parquet:parquet-object)
  ())

(defclass parquet-float16-type (dat/parquet:parquet-object)
  ())
;; Slot-less marker class.
(defclass parquet-null-type (dat/parquet:parquet-object)
  ()
  (:documentation
   "Logical type to annotate a column that is always null.

Sometimes when discovering the schema of existing data, values are always
null and the physical type can't be determined. This annotation signals
the case where the physical type was guessed from all null values.
"))
;; Neither slot has an :initform, so both stay unbound unless their initarg
;; is supplied.
(defclass parquet-decimal-type (dat/parquet:parquet-object)
  ((scale
    :initarg :scale
    :type (signed-byte 32))
   (precision
    :initarg :precision
    :type (signed-byte 32)))
  (:documentation "Decimal logical type annotation

Scale must be zero or a positive integer less than or equal to the precision.
Precision must be a non-zero positive integer.

To maintain forward-compatibility in v1, implementations using this logical
type must also set scale and precision on the annotated SchemaElement.

Allowed for physical types: INT32, INT64, FIXED_LEN_BYTE_ARRAY, and BYTE_ARRAY.
"))
;; Slot-less marker classes used as the slot types of PARQUET-TIME-UNIT.
(defclass parquet-milli-seconds (dat/parquet:parquet-object)
  ()
  (:documentation "Time units for logical types
"))

(defclass parquet-micro-seconds (dat/parquet:parquet-object)
  ())

(defclass parquet-nano-seconds (dat/parquet:parquet-object)
  ())
;; One slot per time unit; each defaults to NIL.
(defclass parquet-time-unit (dat/parquet:parquet-object)
  ((millis
    :initarg :millis
    :initform nil
    :type (or null parquet-milli-seconds))
   (micros
    :initarg :micros
    :initform nil
    :type (or null parquet-micro-seconds))
   (nanos
    :initarg :nanos
    :initform nil
    :type (or null parquet-nano-seconds))))
;; Both slots lack an :initform and stay unbound unless initialised.
(defclass parquet-timestamp-type (dat/parquet:parquet-object)
  ((isadjustedtoutc
    :initarg :isadjustedtoutc
    :type boolean)
   (unit
    :initarg :unit
    :type parquet-time-unit))
  (:documentation "Timestamp logical type annotation

Allowed for physical types: INT64
"))
;; Same slot layout as PARQUET-TIMESTAMP-TYPE; both slots lack an :initform.
(defclass parquet-time-type (dat/parquet:parquet-object)
  ((isadjustedtoutc
    :initarg :isadjustedtoutc
    :type boolean)
   (unit
    :initarg :unit
    :type parquet-time-unit))
  (:documentation "Time logical type annotation

Allowed for physical types: INT32 (millis), INT64 (micros, nanos)
"))
;; BITWIDTH was the only integer slot in the file without a :type.  The
;; values permitted by the class docstring (8, 16, 32, 64) all fit in
;; (SIGNED-BYTE 8), which corresponds to the Thrift i8 field.
;; NOTE(review): the generator appears to lack an i8 mapping -- fix it there
;; too, or this declaration is lost on regeneration.
(defclass parquet-int-type (dat/parquet:parquet-object)
  ((bitwidth
    :initarg :bitwidth
    :type (signed-byte 8))
   (issigned
    :initarg :issigned
    :type boolean))
  (:documentation "Integer logical type annotation

bitWidth must be 8, 16, 32, or 64.

Allowed for physical types: INT32, INT64
"))
;; Slot-less marker class.
(defclass parquet-json-type (dat/parquet:parquet-object)
  ()
  (:documentation "Embedded JSON logical type annotation

Allowed for physical types: BYTE_ARRAY
"))
;; Slot-less marker class.
(defclass parquet-bson-type (dat/parquet:parquet-object)
  ()
  (:documentation "Embedded BSON logical type annotation

Allowed for physical types: BYTE_ARRAY
"))
;; One slot per logical-type annotation class; each defaults to NIL.
(defclass parquet-logical-type (dat/parquet:parquet-object)
  ((string
    :initarg :string
    :initform nil
    :type (or null parquet-string-type))
   (map
    :initarg :map
    :initform nil
    :type (or null parquet-map-type))
   (list
    :initarg :list
    :initform nil
    :type (or null parquet-list-type))
   (enum
    :initarg :enum
    :initform nil
    :type (or null parquet-enum-type))
   (decimal
    :initarg :decimal
    :initform nil
    :type (or null parquet-decimal-type))
   (date
    :initarg :date
    :initform nil
    :type (or null parquet-date-type))
   (time
    :initarg :time
    :initform nil
    :type (or null parquet-time-type))
   (timestamp
    :initarg :timestamp
    :initform nil
    :type (or null parquet-timestamp-type))
   (integer
    :initarg :integer
    :initform nil
    :type (or null parquet-int-type))
   (unknown
    :initarg :unknown
    :initform nil
    :type (or null parquet-null-type))
   (json
    :initarg :json
    :initform nil
    :type (or null parquet-json-type))
   (bson
    :initarg :bson
    :initform nil
    :type (or null parquet-bson-type))
   (uuid
    :initarg :uuid
    :initform nil
    :type (or null parquet-uuid-type))
   (float16
    :initarg :float16
    :initform nil
    :type (or null parquet-float16-type)))
  (:documentation "LogicalType annotations to replace ConvertedType.

To maintain compatibility, implementations using LogicalType for a
SchemaElement must also set the corresponding ConvertedType (if any)
from the following table.
"))
;; NAME is the only slot without an :initform (unbound until supplied);
;; every other slot defaults to NIL.  PRECISION has no docstring of its own;
;; the SCALE docstring describes the pair.
(defclass parquet-schema-element (dat/parquet:parquet-object)
  ((type
    :initarg :type
    :initform nil
    :type (or null parquet-type)
    :documentation
    "Data type for this field. Not set if the current element is a non-leaf node
")
   (type-length
    :initarg :type-length
    :initform nil
    :type (or null (signed-byte 32))
    :documentation
    "If type is FIXED_LEN_BYTE_ARRAY, this is the byte length of the values.
Otherwise, if specified, this is the maximum bit length to store any of the values.
(e.g. a low cardinality INT col could have this set to 3). Note that this is
in the schema, and therefore fixed for the entire file.
")
   (repetition-type
    :initarg :repetition-type
    :initform nil
    :type (or null parquet-field-repetition-type)
    :documentation
    "repetition of the field. The root of the schema does not have a repetition_type.
All other nodes must have one
")
   (name
    :initarg :name
    :type string
    :documentation "Name of the field in the schema
")
   (num-children
    :initarg :num-children
    :initform nil
    :type (or null (signed-byte 32))
    :documentation
    "Nested fields. Since thrift does not support nested fields,
the nesting is flattened to a single list by a depth-first traversal.
The children count is used to construct the nested relationship.
This field is not set when the element is a primitive type
")
   (converted-type
    :initarg :converted-type
    :initform nil
    :type (or null parquet-converted-type)
    :documentation
    "DEPRECATED: When the schema is the result of a conversion from another model.
Used to record the original type to help with cross conversion.

This is superseded by logicalType.
")
   (scale
    :initarg :scale
    :initform nil
    :type (or null (signed-byte 32))
    :documentation
    "DEPRECATED: Used when this column contains decimal data.
See the DECIMAL converted type for more details.

This is superseded by using the DecimalType annotation in logicalType.
")
   (precision
    :initarg :precision
    :initform nil
    :type (or null (signed-byte 32)))
   (field-id
    :initarg :field-id
    :initform nil
    :type (or null (signed-byte 32))
    :documentation
    "When the original schema supports field ids, this will save the
original field id in the parquet schema
")
   (logicaltype
    :initarg :logicaltype
    :initform nil
    :type (or null parquet-logical-type)
    :documentation "The logical type of this SchemaElement

LogicalType replaces ConvertedType, but ConvertedType is still required
for some logical types to ensure forward-compatibility in format v1.
"))
  (:documentation "Represents an element inside a schema definition.
 - if it is a group (inner node) then type is undefined and num_children is defined
 - if it is a primitive type (leaf) then type is defined and num_children is undefined
the nodes are listed in depth first traversal order.
"))
;; STATISTICS defaults to NIL; the other slots have no :initform and stay
;; unbound unless initialised.
(defclass parquet-data-page-header (dat/parquet:parquet-object)
  ((num-values
    :initarg :num-values
    :type (signed-byte 32)
    :documentation
    "Number of values, including NULLs, in this data page.

If an OffsetIndex is present, a page must begin at a row
boundary (repetition_level = 0). Otherwise, pages may begin
within a row (repetition_level > 0).

")
   (encoding
    :initarg :encoding
    :type parquet-encoding
    :documentation "Encoding used for this data page *
")
   (definition-level-encoding
    :initarg :definition-level-encoding
    :type parquet-encoding
    :documentation
    "Encoding used for definition levels *
")
   (repetition-level-encoding
    :initarg :repetition-level-encoding
    :type parquet-encoding
    :documentation
    "Encoding used for repetition levels *
")
   (statistics
    :initarg :statistics
    :initform nil
    :type (or null parquet-statistics)
    :documentation
    "Optional statistics for the data in this page *
"))
  (:documentation "Data page header
"))
;; Slot-less class; used as the type of PARQUET-PAGE-HEADER's
;; INDEX-PAGE-HEADER slot.
(defclass parquet-index-page-header (dat/parquet:parquet-object)
  ())
;; IS-SORTED defaults to NIL; NUM-VALUES and ENCODING have no :initform.
(defclass parquet-dictionary-page-header (dat/parquet:parquet-object)
  ((num-values
    :initarg :num-values
    :type (signed-byte 32)
    :documentation "Number of values in the dictionary *
")
   (encoding
    :initarg :encoding
    :type parquet-encoding
    :documentation "Encoding using this dictionary page *
")
   (is-sorted
    :initarg :is-sorted
    :initform nil
    :type (or null boolean)
    :documentation
    "If true, the entries in the dictionary are sorted in ascending order *
"))
  (:documentation
   "The dictionary page must be placed at the first position of the column chunk
if it is partly or completely dictionary encoded. At most one dictionary page
can be placed in a column chunk.

"))
;; IS-COMPRESSED now defaults to T.  Its docstring says a missing value
;; means "compressed", but the previous NIL default made an absent field
;; read as "not compressed".  An explicit `:is-compressed nil' still
;; represents an uncompressed page.
;; NOTE(review): the generator should carry over Thrift default values, or
;; this default is lost on regeneration.
(defclass parquet-data-page-header-v2 (dat/parquet:parquet-object)
  ((num-values
    :initarg :num-values
    :type (signed-byte 32)
    :documentation
    "Number of values, including NULLs, in this data page. *
")
   (num-nulls
    :initarg :num-nulls
    :type (signed-byte 32)
    :documentation "Number of NULL values, in this data page.
Number of non-null = num_values - num_nulls which is also the number of values in the data section *
")
   (num-rows
    :initarg :num-rows
    :type (signed-byte 32)
    :documentation
    "Number of rows in this data page. Every page must begin at a
row boundary (repetition_level = 0): rows must **not** be
split across page boundaries when using V2 data pages.

")
   (encoding
    :initarg :encoding
    :type parquet-encoding
    :documentation "Encoding used for data in this page *
")
   (definition-levels-byte-length
    :initarg :definition-levels-byte-length
    :type (signed-byte 32)
    :documentation
    "Length of the definition levels
")
   (repetition-levels-byte-length
    :initarg :repetition-levels-byte-length
    :type (signed-byte 32)
    :documentation
    "Length of the repetition levels
")
   (is-compressed
    :initarg :is-compressed
    :initform t
    :type (or null boolean)
    :documentation "Whether the values are compressed.
Which means the section of the page between
definition_levels_byte_length + repetition_levels_byte_length + 1 and compressed_page_size (included)
is compressed with the compression_codec.
If missing it is considered compressed
")
   (statistics
    :initarg :statistics
    :initform nil
    :type (or null parquet-statistics)
    :documentation
    "Optional statistics for the data in this page *
"))
  (:documentation
   "New page format allowing reading levels without decompressing the data
Repetition and definition levels are uncompressed
The remaining section containing the data is compressed if is_compressed is true

"))
;; Slot-less marker class.
(defclass parquet-split-block-algorithm (dat/parquet:parquet-object)
  ()
  (:documentation "Block-based algorithm type annotation. *
"))
;; Single slot BLOCK, defaulting to NIL.
(defclass parquet-bloom-filter-algorithm (dat/parquet:parquet-object)
  ((block
    :initarg :block
    :initform nil
    :type (or null parquet-split-block-algorithm)
    :documentation
    "Block-based Bloom filter. *
"))
  (:documentation "The algorithm used in Bloom filter. *
"))
;; Slot-less marker class.
(defclass parquet-xx-hash (dat/parquet:parquet-object)
  ()
  (:documentation
   "Hash strategy type annotation. xxHash is an extremely fast non-cryptographic hash
algorithm. It uses 64 bits version of xxHash.

"))
;; Single slot XXHASH, defaulting to NIL.
(defclass parquet-bloom-filter-hash (dat/parquet:parquet-object)
  ((xxhash
    :initarg :xxhash
    :initform nil
    :type (or null parquet-xx-hash)
    :documentation "xxHash Strategy. *
"))
  (:documentation
   "The hash function used in Bloom filter. This function takes the hash of a column value
using plain encoding.

"))
;; Slot-less marker class.
(defclass parquet-uncompressed (dat/parquet:parquet-object)
  ()
  (:documentation "The compression used in the Bloom filter.

"))
;; Single slot UNCOMPRESSED, defaulting to NIL.
(defclass parquet-bloom-filter-compression (dat/parquet:parquet-object)
  ((uncompressed
    :initarg :uncompressed
    :initform nil
    :type (or null parquet-uncompressed))))
;; None of the four slots has an :initform; all stay unbound unless their
;; initarg is supplied.
(defclass parquet-bloom-filter-header (dat/parquet:parquet-object)
  ((numbytes
    :initarg :numbytes
    :type (signed-byte 32)
    :documentation "The size of bitset in bytes *
")
   (algorithm
    :initarg :algorithm
    :type parquet-bloom-filter-algorithm
    :documentation "The algorithm for setting bits. *
")
   (hash
    :initarg :hash
    :type parquet-bloom-filter-hash
    :documentation "The hash function used for Bloom filter. *
")
   (compression
    :initarg :compression
    :type parquet-bloom-filter-compression
    :documentation
    "The compression used in the Bloom filter *
"))
  (:documentation
   "Bloom filter header is stored at beginning of Bloom filter data of each column
and followed by its bitset.

"))
;; TYPE and the two page-size slots have no :initform; CRC and the four
;; *-HEADER slots default to NIL.  The CRC docstring drops a doubled "be"
;; and indents bullet continuation lines under their bullet text.
(defclass parquet-page-header (dat/parquet:parquet-object)
  ((type
    :initarg :type
    :type parquet-page-type
    :documentation
    "the type of the page: indicates which of the *_header fields is set *
")
   (uncompressed-page-size
    :initarg :uncompressed-page-size
    :type (signed-byte 32)
    :documentation
    "Uncompressed page size in bytes (not including this header) *
")
   (compressed-page-size
    :initarg :compressed-page-size
    :type (signed-byte 32)
    :documentation
    "Compressed (and potentially encrypted) page size in bytes, not including this header *
")
   (crc
    :initarg :crc
    :initform nil
    :type (or null (signed-byte 32))
    :documentation
    "The 32-bit CRC checksum for the page, to be calculated as follows:

- The standard CRC32 algorithm is used (with polynomial 0x04C11DB7,
  the same as in e.g. GZip).
- All page types can have a CRC (v1 and v2 data pages, dictionary pages,
  etc.).
- The CRC is computed on the serialization binary representation of the page
  (as written to disk), excluding the page header. For example, for v1
  data pages, the CRC is computed on the concatenation of repetition levels,
  definition levels and column values (optionally compressed, optionally
  encrypted).
- The CRC computation therefore takes place after any compression
  and encryption steps, if any.

If enabled, this allows for disabling checksumming in HDFS if only a few
pages need to be read.
")
   (data-page-header
    :initarg :data-page-header
    :initform nil
    :type (or null parquet-data-page-header))
   (index-page-header
    :initarg :index-page-header
    :initform nil
    :type (or null parquet-index-page-header))
   (dictionary-page-header
    :initarg :dictionary-page-header
    :initform nil
    :type (or null parquet-dictionary-page-header))
   (data-page-header-v2
    :initarg :data-page-header-v2
    :initform nil
    :type (or null parquet-data-page-header-v2))))
;; KEY has no :initform; VALUE defaults to NIL.
(defclass parquet-key-value (dat/parquet:parquet-object)
  ((key
    :initarg :key
    :type string)
   (value
    :initarg :value
    :initform nil
    :type (or null string)))
  (:documentation "Wrapper struct to store key values
"))
;; None of the three slots has an :initform.
(defclass parquet-sorting-column (dat/parquet:parquet-object)
  ((column-idx
    :initarg :column-idx
    :type (signed-byte 32)
    :documentation
    "The ordinal position of the column (in this row group) *
")
   (descending
    :initarg :descending
    :type boolean
    :documentation
    "If true, indicates this column is sorted in descending order. *
")
   (nulls-first
    :initarg :nulls-first
    :type boolean
    :documentation
    "If true, nulls will come before non-null values, otherwise,
nulls go at the end.
"))
  (:documentation "Sort order within a RowGroup of a leaf column
"))
;; None of the three slots has an :initform.  The PAGE-TYPE docstring
;; restores the "/" separators the generator had turned into backslashes.
(defclass parquet-page-encoding-stats (dat/parquet:parquet-object)
  ((page-type
    :initarg :page-type
    :type parquet-page-type
    :documentation "the page type (data/dic/...) *
")
   (encoding
    :initarg :encoding
    :type parquet-encoding
    :documentation "encoding of the page *
")
   (count
    :initarg :count
    :type (signed-byte 32)
    :documentation
    "number of pages of this type with this encoding *
"))
  (:documentation "statistics of a given page type and encoding
"))
;; Slots typed (or null ...) default to NIL; the others have no :initform
;; and stay unbound unless initialised.  The docstrings restore the "/"
;; characters the generator had turned into backslashes ("key/value",
;; "I/O", "nullability/list length").
(defclass parquet-column-meta-data (dat/parquet:parquet-object)
  ((type
    :initarg :type
    :type parquet-type
    :documentation "Type of this column *
")
   (encodings
    :initarg :encodings
    :type (vector parquet-encoding)
    :documentation
    "Set of all encodings used for this column. The purpose is to validate
whether we can decode those pages. *
")
   (path-in-schema
    :initarg :path-in-schema
    :type (vector string)
    :documentation "Path in schema *
")
   (codec
    :initarg :codec
    :type parquet-compression-codec
    :documentation "Compression codec *
")
   (num-values
    :initarg :num-values
    :type (signed-byte 64)
    :documentation "Number of values in this column *
")
   (total-uncompressed-size
    :initarg :total-uncompressed-size
    :type (signed-byte 64)
    :documentation
    "total byte size of all uncompressed pages in this column chunk (including the headers) *
")
   (total-compressed-size
    :initarg :total-compressed-size
    :type (signed-byte 64)
    :documentation
    "total byte size of all compressed, and potentially encrypted, pages
in this column chunk (including the headers) *
")
   (key-value-metadata
    :initarg :key-value-metadata
    :initform nil
    :type (or null (vector parquet-key-value))
    :documentation "Optional key/value metadata *
")
   (data-page-offset
    :initarg :data-page-offset
    :type (signed-byte 64)
    :documentation
    "Byte offset from beginning of file to first data page *
")
   (index-page-offset
    :initarg :index-page-offset
    :initform nil
    :type (or null (signed-byte 64))
    :documentation
    "Byte offset from beginning of file to root index page *
")
   (dictionary-page-offset
    :initarg :dictionary-page-offset
    :initform nil
    :type (or null (signed-byte 64))
    :documentation
    "Byte offset from the beginning of file to first (only) dictionary page *
")
   (statistics
    :initarg :statistics
    :initform nil
    :type (or null parquet-statistics)
    :documentation "optional statistics for this column chunk
")
   (encoding-stats
    :initarg :encoding-stats
    :initform nil
    :type (or null (vector parquet-page-encoding-stats))
    :documentation
    "Set of all encodings used for pages in this column chunk.
This information can be used to determine if all data pages are
dictionary encoded for example *
")
   (bloom-filter-offset
    :initarg :bloom-filter-offset
    :initform nil
    :type (or null (signed-byte 64))
    :documentation
    "Byte offset from beginning of file to Bloom filter data. *
")
   (bloom-filter-length
    :initarg :bloom-filter-length
    :initform nil
    :type (or null (signed-byte 32))
    :documentation
    "Size of Bloom filter data including the serialized header, in bytes.
Added in 2.10 so readers may not read this field from old files and
it can be obtained after the BloomFilterHeader has been deserialized.
Writers should write this field so readers can read the bloom filter
in a single I/O.
")
   (size-statistics
    :initarg :size-statistics
    :initform nil
    :type (or null parquet-size-statistics)
    :documentation
    "Optional statistics to help estimate total memory when converted to in-memory
representations. The histograms contained in these statistics can
also be useful in some cases for more fine-grained nullability/list length
filter pushdown.
"))
  (:documentation "Description for column metadata
"))
;; Slot-less marker class.
(defclass parquet-encryption-with-footer-key (dat/parquet:parquet-object)
  ())
;; PATH-IN-SCHEMA has no :initform; KEY-METADATA defaults to NIL.
(defclass parquet-encryption-with-column-key (dat/parquet:parquet-object)
  ((path-in-schema
    :initarg :path-in-schema
    :type (vector string)
    :documentation "Column path in schema *
")
   (key-metadata
    :initarg :key-metadata
    :initform nil
    :type (or null octet-vector)
    :documentation
    "Retrieval metadata of column encryption key *
")))
;; One slot per encryption variant; both default to NIL.
(defclass parquet-column-crypto-meta-data (dat/parquet:parquet-object)
  ((encryption-with-footer-key
    :initarg :encryption-with-footer-key
    :initform nil
    :type (or null parquet-encryption-with-footer-key))
   (encryption-with-column-key
    :initarg :encryption-with-column-key
    :initform nil
    :type (or null parquet-encryption-with-column-key))))
;; FILE-OFFSET is the only slot without an :initform; all others default to
;; NIL.  The META-DATA docstring restores the "/" in
;; "file_path/file_offset" that the generator had turned into a backslash.
(defclass parquet-column-chunk (dat/parquet:parquet-object)
  ((file-path
    :initarg :file-path
    :initform nil
    :type (or null string)
    :documentation
    "File where column data is stored. If not set, assumed to be same file as
metadata. This path is relative to the current file.

")
   (file-offset
    :initarg :file-offset
    :type (signed-byte 64)
    :documentation
    "Deprecated: Byte offset in file_path to the ColumnMetaData

Past use of this field has been inconsistent, with some implementations
using it to point to the ColumnMetaData and some using it to point to
the first page in the column chunk. In many cases, the ColumnMetaData at this
location is wrong. This field is now deprecated and should not be used.
Writers should set this field to 0 if no ColumnMetaData has been written outside
the footer.
")
   (meta-data
    :initarg :meta-data
    :initform nil
    :type (or null parquet-column-meta-data)
    :documentation
    "Column metadata for this chunk. Some writers may also replicate this at the
location pointed to by file_path/file_offset.
Note: while marked as optional, this field is in fact required by most major
Parquet implementations. As such, writers MUST populate this field.

")
   (offset-index-offset
    :initarg :offset-index-offset
    :initform nil
    :type (or null (signed-byte 64))
    :documentation
    "File offset of ColumnChunk's OffsetIndex *
")
   (offset-index-length
    :initarg :offset-index-length
    :initform nil
    :type (or null (signed-byte 32))
    :documentation
    "Size of ColumnChunk's OffsetIndex, in bytes *
")
   (column-index-offset
    :initarg :column-index-offset
    :initform nil
    :type (or null (signed-byte 64))
    :documentation
    "File offset of ColumnChunk's ColumnIndex *
")
   (column-index-length
    :initarg :column-index-length
    :initform nil
    :type (or null (signed-byte 32))
    :documentation
    "Size of ColumnChunk's ColumnIndex, in bytes *
")
   (crypto-metadata
    :initarg :crypto-metadata
    :initform nil
    :type (or null parquet-column-crypto-meta-data)
    :documentation
    "Crypto metadata of encrypted columns *
")
   (encrypted-column-metadata
    :initarg :encrypted-column-metadata
    :initform nil
    :type (or null octet-vector)
    :documentation
    "Encrypted column metadata for this chunk *
")))
;; Describes one row group: its column chunks, size and row counts, and
;; optional sort order, file offset, compressed size and ordinal.  COLUMNS,
;; TOTAL-BYTE-SIZE and NUM-ROWS are declared without an :initform; the
;; remaining slots default to NIL.
;; NOTE(review): this file is generated (see the file header); mirror hand
;; edits in the generator as well.
(defclass parquet-row-group (dat/parquet:parquet-object)
  ((columns
    :initarg :columns
    :type (vector parquet-column-chunk)
    :documentation
    "Metadata for each column chunk in this row group.
This list must have the same order as the SchemaElement list in FileMetaData.")
   (total-byte-size
    :initarg :total-byte-size
    :type (signed-byte 64)
    :documentation
    "Total byte size of all the uncompressed column data in this row group")
   (num-rows
    :initarg :num-rows
    :type (signed-byte 64)
    :documentation "Number of rows in this row group")
   (sorting-columns
    :initarg :sorting-columns
    :type (or null (vector parquet-sorting-column))
    :initform nil
    :documentation
    "If set, specifies a sort ordering of the rows in this RowGroup.
The sorting columns can be a subset of all the columns.")
   (file-offset
    :initarg :file-offset
    :type (or null (signed-byte 64))
    :initform nil
    :documentation
    "Byte offset from beginning of file to first page (data or dictionary)
in this row group")
   (total-compressed-size
    :initarg :total-compressed-size
    :type (or null (signed-byte 64))
    :initform nil
    :documentation
    "Total byte size of all compressed (and potentially encrypted) column data
in this row group")
   (ordinal
    :initarg :ordinal
    :type (or null (signed-byte 16))
    :initform nil
    :documentation "Row group ordinal in the file")))
;; Slot-less marker class; PARQUET-COLUMN-ORDER uses it as the type of its
;; TYPE-ORDER slot.
(defclass parquet-type-defined-order (dat/parquet:parquet-object)
  ()
  (:documentation
   "Empty struct to signal the order defined by the physical or logical type
"))
;; Single-slot class: TYPE-ORDER optionally holds a
;; PARQUET-TYPE-DEFINED-ORDER and defaults to NIL.  The slot docstring lists
;; the sort order per logical and physical type.
(defclass parquet-column-order (dat/parquet:parquet-object)
  ((type-order
    :initarg :type-order
    :type (or null parquet-type-defined-order)
    :initform nil
    :documentation "The sort orders for logical types are:
 UTF8 - unsigned byte-wise comparison
 INT8 - signed comparison
 INT16 - signed comparison
 INT32 - signed comparison
 INT64 - signed comparison
 UINT8 - unsigned comparison
 UINT16 - unsigned comparison
 UINT32 - unsigned comparison
 UINT64 - unsigned comparison
 DECIMAL - signed comparison of the represented value
 DATE - signed comparison
 TIME_MILLIS - signed comparison
 TIME_MICROS - signed comparison
 TIMESTAMP_MILLIS - signed comparison
 TIMESTAMP_MICROS - signed comparison
 INTERVAL - undefined
 JSON - unsigned byte-wise comparison
 BSON - unsigned byte-wise comparison
 ENUM - unsigned byte-wise comparison
 LIST - undefined
 MAP - undefined

In the absence of logical types, the sort order is determined by the physical type:
 BOOLEAN - false, true
 INT32 - signed comparison
 INT64 - signed comparison
 INT96 (only used for legacy timestamps) - undefined
 FLOAT - signed comparison of the represented value (*)
 DOUBLE - signed comparison of the represented value (*)
 BYTE_ARRAY - unsigned byte-wise comparison
 FIXED_LEN_BYTE_ARRAY - unsigned byte-wise comparison

(*) Because the sorting order is not specified properly for floating
 point values (relations vs. total ordering) the following
 compatibility rules should be applied when reading statistics:
 - If the min is a NaN, it should be ignored.
 - If the max is a NaN, it should be ignored.
 - If the min is +0, the row group may contain -0 values as well.
 - If the max is -0, the row group may contain +0 values as well.
 - When looking for NaN values, min and max should be ignored.

 When writing statistics the following rules should be followed:
 - NaNs should not be written to min or max statistics fields.
 - If the computed max value is zero (whether negative or positive),
 `+0.0` should be written into the max statistics field.
 - If the computed min value is zero (whether negative or positive),
 `-0.0` should be written into the min statistics field.
"))
  (:documentation
   "Union to specify the order used for the min_value and max_value fields for a
column. This union takes the role of an enhanced enum that allows rich
elements (which will be needed for a collation-based ordering in the future).

Possible values are:
* TypeDefinedOrder - the column uses the order defined by its logical or
 physical type (if there is no logical type).

If the reader does not support the value of this union, min and max stats
for this column should be ignored.
"))
;; Locates one page: its file offset, its compressed size and the index of
;; its first row within the row group.  None of the three slots declares an
;; :initform here.
;; NOTE(review): this file is generated (see the file header); mirror hand
;; edits in the generator as well.
(defclass parquet-page-location (dat/parquet:parquet-object)
  ((offset
    :initarg :offset
    :type (signed-byte 64)
    :documentation "Offset of the page in the file")
   (compressed-page-size
    :initarg :compressed-page-size
    :type (signed-byte 32)
    :documentation
    "Size of the page, including header. Sum of compressed_page_size and header
length")
   (first-row-index
    :initarg :first-row-index
    :type (signed-byte 64)
    :documentation
    "Index within the RowGroup of the first row of the page. When an
OffsetIndex is present, pages must begin on row boundaries
(repetition_level = 0).")))
;; Page locations for a column chunk plus an optional vector of unencoded
;; BYTE_ARRAY sizes (defaults to NIL).
;; NOTE(review): this file is generated (see the file header); mirror hand
;; edits in the generator as well.
(defclass parquet-offset-index (dat/parquet:parquet-object)
  ((page-locations
    :initarg :page-locations
    :type (vector parquet-page-location)
    :documentation
    "PageLocations, ordered by increasing PageLocation.offset. It is required
that page_locations[i].first_row_index < page_locations[i+1].first_row_index.")
   (unencoded-byte-array-data-bytes
    :initarg :unencoded-byte-array-data-bytes
    :type (or null (vector (signed-byte 64)))
    :initform nil
    :documentation
    "Unencoded/uncompressed size for BYTE_ARRAY types.

See documentation for unencoded_byte_array_data_bytes in SizeStatistics for
more details on this field."))
  (:documentation
   "Optional offsets for each data page in a ColumnChunk.

Forms part of the page index, along with ColumnIndex.

OffsetIndex may be present even if ColumnIndex is not."))
;; Per-page statistics for a column chunk: null-page flags, min/max bounds,
;; their boundary order, and optional null counts and level histograms.
;; NULL-PAGES, MIN-VALUES, MAX-VALUES and BOUNDARY-ORDER declare no
;; :initform here; the remaining slots default to NIL.
;; NOTE(review): this file is generated (see the file header); mirror hand
;; edits in the generator as well.
(defclass parquet-column-index (dat/parquet:parquet-object)
  ((null-pages
    :initarg :null-pages
    :type (vector boolean)
    :documentation
    "A list of Boolean values to determine the validity of the corresponding
min and max values. If true, a page contains only null values, and writers
have to set the corresponding entries in min_values and max_values to
byte[0], so that all lists have the same length. If false, the
corresponding entries in min_values and max_values must be valid.")
   (min-values
    :initarg :min-values
    :type (vector octet-vector)
    :documentation
    "Two lists containing lower and upper bounds for the values of each page
determined by the ColumnOrder of the column. These may be the actual
minimum and maximum values found on a page, but can also be (more compact)
values that do not exist on a page. For example, instead of storing \"Blart
Versenwald III\", a writer may set min_values[i]=\"B\", max_values[i]=\"C\".
Such more compact values must still be valid values within the column's
logical type. Readers must make sure that list entries are populated before
using them by inspecting null_pages.")
   (max-values
    :initarg :max-values
    :type (vector octet-vector)
    :documentation
    "Upper bounds for the values of each page. See the min-values slot, whose
documentation describes both lists.")
   (boundary-order
    :initarg :boundary-order
    :type parquet-boundary-order
    :documentation
    "Stores whether both min_values and max_values are ordered and if so, in
which direction. This allows readers to perform binary searches in both
lists. Readers cannot assume that max_values[i] <= min_values[i+1], even
if the lists are ordered.")
   (null-counts
    :initarg :null-counts
    :type (or null (vector (signed-byte 64)))
    :initform nil
    :documentation
    "A list containing the number of null values for each page")
   (repetition-level-histograms
    :initarg :repetition-level-histograms
    :type (or null (vector (signed-byte 64)))
    :initform nil
    :documentation
    "Contains repetition level histograms for each page
concatenated together. The repetition_level_histogram field on
SizeStatistics contains more details.

When present the length should always be (number of pages *
(max_repetition_level + 1)) elements.

Element 0 is the first element of the histogram for the first page.
Element (max_repetition_level + 1) is the first element of the histogram
for the second page.")
   (definition-level-histograms
    :initarg :definition-level-histograms
    :type (or null (vector (signed-byte 64)))
    :initform nil
    :documentation
    "Same as repetition_level_histograms except for definitions levels."))
  (:documentation
   "Optional statistics for each data page in a ColumnChunk.

Forms part of the page index, along with OffsetIndex.

If this structure is present, OffsetIndex must also be present.

For each field in this structure, <field>[i] refers to the page at
OffsetIndex.page_locations[i]"))
;; Three optional AAD-related slots (prefix, file-unique part, and a flag
;; saying the reader must supply the prefix); all default to NIL.
;; NOTE(review): this file is generated (see the file header); mirror hand
;; edits in the generator as well.
(defclass parquet-aes-gcm-v1 (dat/parquet:parquet-object)
  ((aad-prefix
    :initarg :aad-prefix
    :type (or null octet-vector)
    :initform nil
    :documentation "AAD prefix")
   (aad-file-unique
    :initarg :aad-file-unique
    :type (or null octet-vector)
    :initform nil
    :documentation "Unique file identifier part of AAD suffix")
   (supply-aad-prefix
    :initarg :supply-aad-prefix
    :type (or null boolean)
    :initform nil
    :documentation
    "In files encrypted with AAD prefix without storing it,
readers must supply the prefix")))
;; Same slot layout as PARQUET-AES-GCM-V1: three optional AAD-related slots
;; (prefix, file-unique part, supply-prefix flag), all defaulting to NIL.
;; NOTE(review): this file is generated (see the file header); mirror hand
;; edits in the generator as well.
(defclass parquet-aes-gcm-ctr-v1 (dat/parquet:parquet-object)
  ((aad-prefix
    :initarg :aad-prefix
    :type (or null octet-vector)
    :initform nil
    :documentation "AAD prefix")
   (aad-file-unique
    :initarg :aad-file-unique
    :type (or null octet-vector)
    :initform nil
    :documentation "Unique file identifier part of AAD suffix")
   (supply-aad-prefix
    :initarg :supply-aad-prefix
    :type (or null boolean)
    :initform nil
    :documentation
    "In files encrypted with AAD prefix without storing it,
readers must supply the prefix")))
;; Two optional slots, one per algorithm variant; both default to NIL.
(defclass parquet-encryption-algorithm (dat/parquet:parquet-object)
  ((aes-gcm-v1
    :initarg :aes-gcm-v1
    :type (or null parquet-aes-gcm-v1)
    :initform nil)
   (aes-gcm-ctr-v1
    :initarg :aes-gcm-ctr-v1
    :type (or null parquet-aes-gcm-ctr-v1)
    :initform nil)))
;; File-level metadata: version, flattened schema, row count and row groups
;; (declared without an :initform), plus optional key/value metadata,
;; creator string, column orders and encryption fields (all default to NIL).
;; NOTE(review): this file is generated (see the file header); mirror hand
;; edits in the generator as well.
(defclass parquet-file-meta-data (dat/parquet:parquet-object)
  ((version
    :initarg :version
    :type (signed-byte 32)
    :documentation "Version of this file")
   (schema
    :initarg :schema
    :type (vector parquet-schema-element)
    :documentation
    "Parquet schema for this file. This schema contains metadata for all the columns.
The schema is represented as a tree with a single root. The nodes of the tree
are flattened to a list by doing a depth-first traversal.
The column metadata contains the path in the schema for that column which can be
used to map columns to nodes in the schema.
The first element is the root")
   (num-rows
    :initarg :num-rows
    :type (signed-byte 64)
    :documentation "Number of rows in this file")
   (row-groups
    :initarg :row-groups
    :type (vector parquet-row-group)
    :documentation "Row groups in this file")
   (key-value-metadata
    :initarg :key-value-metadata
    :type (or null (vector parquet-key-value))
    :initform nil
    :documentation "Optional key/value metadata")
   (created-by
    :initarg :created-by
    :type (or null string)
    :initform nil
    :documentation
    "String for application that wrote this file. This should be in the format
<Application> version <App Version> (build <App Build Hash>).
e.g. impala version 1.0 (build 6cf94d29b2b7115df4de2c06e2ab4326d721eb55)")
   (column-orders
    :initarg :column-orders
    :type (or null (vector parquet-column-order))
    :initform nil
    :documentation
    "Sort order used for the min_value and max_value fields in the Statistics
objects and the min_values and max_values fields in the ColumnIndex
objects of each column in this file. Sort orders are listed in the order
matching the columns in the schema. The indexes are not necessarily the same
though, because only leaf nodes of the schema are represented in the list
of sort orders.

Without column_orders, the meaning of the min_value and max_value fields
in the Statistics object and the ColumnIndex object is undefined. To ensure
well-defined behaviour, if these fields are written to a Parquet file,
column_orders must be written as well.

The obsolete min and max fields in the Statistics object are always sorted
by signed comparison regardless of column_orders.")
   (encryption-algorithm
    :initarg :encryption-algorithm
    :type (or null parquet-encryption-algorithm)
    :initform nil
    :documentation
    "Encryption algorithm. This field is set only in encrypted files
with plaintext footer. Files with encrypted footer store algorithm id
in FileCryptoMetaData structure.")
   (footer-signing-key-metadata
    :initarg :footer-signing-key-metadata
    :type (or null octet-vector)
    :initform nil
    :documentation
    "Retrieval metadata of key used for signing the footer.
Used only in encrypted files with plaintext footer."))
  (:documentation "Description for file metadata"))
;; Crypto metadata record: ENCRYPTION-ALGORITHM is declared without an
;; :initform, while KEY-METADATA is optional and defaults to NIL.
;; NOTE(review): this file is generated (see the file header); mirror hand
;; edits in the generator as well.
(defclass parquet-file-crypto-meta-data (dat/parquet:parquet-object)
  ((encryption-algorithm
    :initarg :encryption-algorithm
    :type parquet-encryption-algorithm
    :documentation
    "Encryption algorithm. This field is only used for files
with encrypted footer. Files with plaintext footer store algorithm id
inside footer (FileMetaData structure).")
   (key-metadata
    :initarg :key-metadata
    :type (or null octet-vector)
    :initform nil
    :documentation
    "Retrieval metadata of key used for encryption of footer,
and (possibly) columns"))
  (:documentation "Crypto metadata for files with encrypted footer"))