changelog shortlog graph tags branches changeset files revisions annotate raw help

Mercurial > core / lisp/lib/dat/parquet/thrift.lisp

changeset 640: 642b3b82b20d
parent: b88bf15f60d0
author: Richard Westhaver <ellis@rwest.io>
date: Sun, 08 Sep 2024 17:35:03 -0400
permissions: -rw-r--r--
description: thrift fixes, org-get-with-inheritance init
1 ;;; /home/ellis/comp/core/lisp/lib/dat/parquet/thrift.lisp --- Parquet Thrift Definitions -*- buffer-read-only:t -*-
2 
3 ;; input = /home/ellis/comp/core/.stash/parquet.json
4 
5 ;; This file was generated automatically by
6 ;; DAT/PARQUET/GEN:PARSE-PARQUET-THRIFT-DEFINITIONS
7 
8 ;; Do not modify.
9 
10 ;;; Code:
11 (in-package :dat/parquet)
12 
;; Keyword tables, one per enumeration used by the definitions in this file.
;; NOTE(review): a keyword's position in a list is not necessarily its Thrift
;; wire value (the upstream Encoding enum, for one, skips the value 1) --
;; confirm before indexing these lists with a decoded integer.

;; Physical storage types.
(defvar *parquet-json-types*
  '(:boolean
    :int32 :int64 :int96
    :float :double
    :byte-array :fixed-len-byte-array))

;; Legacy converted (logical) types.
(defvar *parquet-json-converted-types*
  '(:utf8 :map :map-key-value :list :enum :decimal :date
    :time-millis :time-micros
    :timestamp-millis :timestamp-micros
    :uint-8 :uint-16 :uint-32 :uint-64
    :int-8 :int-16 :int-32 :int-64
    :json :bson :interval))

;; Field repetition kinds.
(defvar *parquet-json-field-repetition-types*
  '(:required :optional :repeated))

;; Value encodings.
(defvar *parquet-json-encodings*
  '(:plain
    :plain-dictionary
    :rle
    :bit-packed
    :delta-binary-packed
    :delta-length-byte-array
    :delta-byte-array
    :rle-dictionary
    :byte-stream-split))

;; Compression codecs.
(defvar *parquet-json-compression-codecs*
  '(:uncompressed :snappy :gzip :lzo :brotli :lz4 :zstd :lz4-raw))

;; Page kinds.
(defvar *parquet-json-page-types*
  '(:data-page :index-page :dictionary-page :data-page-v2))

;; Ordering of min/max boundary values.
(defvar *parquet-json-boundary-orders*
  '(:unordered :ascending :descending))
;; Lisp type aliases for the fixed-width Parquet physical types.
(deftype parquet-boolean ()
  "Parquet BOOLEAN: T or NIL."
  'boolean)
(deftype parquet-int32 ()
  "Parquet INT32: 32-bit signed integer."
  '(signed-byte 32))
(deftype parquet-int64 ()
  "Parquet INT64: 64-bit signed integer."
  '(signed-byte 64))
(deftype parquet-int96 ()
  "Parquet INT96: 96-bit signed integer."
  '(signed-byte 96))
;; FLOAT is the 32-bit floating point type, so it maps to SINGLE-FLOAT.  The
;; bare FLOAT specifier would also admit double-floats and make this type
;; indistinguishable from PARQUET-DOUBLE for such values.
(deftype parquet-float ()
  "Parquet FLOAT: 32-bit floating point value."
  'single-float)
(deftype parquet-double ()
  "Parquet DOUBLE: 64-bit floating point value."
  'double-float)
;; Both byte-array types expand to an OCTET-VECTOR specifier carrying SIZE.
;; SIZE is optional for the variable-length form (an omitted &optional
;; deftype argument defaults to *) and mandatory for the fixed-length form.
(deftype parquet-byte-array (&optional dat/parquet/gen::size)
  (list 'octet-vector dat/parquet/gen::size))
(deftype parquet-fixed-len-byte-array (dat/parquet/gen::size)
  (list 'octet-vector dat/parquet/gen::size))
;; Size statistics: one optional byte count plus two optional histograms.
;; The first docstring previously contained stray backslashes where the
;; upstream text has `/` and double quotes; those are restored here.
(defclass parquet-size-statistics (parquet-object)
  ((unencoded-byte-array-data-bytes
    :initarg :unencoded-byte-array-data-bytes
    :initform nil
    :type (or null (signed-byte 64))
    :documentation
    "The number of physical bytes stored for BYTE_ARRAY data values assuming
no encoding. This is exclusive of the bytes needed to store the length of
each byte array. In other words, this field is equivalent to the `(size
of PLAIN-ENCODING the byte array values) - (4 bytes * number of values
written)`. To determine unencoded sizes of other types readers can use
schema information multiplied by the number of non-null and null values.
The number of null/non-null values can be inferred from the histograms
below.

For example, if a column chunk is dictionary-encoded with dictionary
[\"a\", \"bc\", \"cde\"], and a data page contains the indices [0, 0, 1, 2],
then this value for that data page should be 7 (1 + 1 + 2 + 3).

This field should only be set for types that use BYTE_ARRAY as their
physical type.
")
   (repetition-level-histogram
    :initarg :repetition-level-histogram
    :initform nil
    :type (or null (vector (signed-byte 64)))
    :documentation
    "When present, there is expected to be one element corresponding to each
repetition (i.e. size=max repetition_level+1) where each element
represents the number of times the repetition level was observed in the
data.

This field may be omitted if max_repetition_level is 0 without loss
of information.

")
   (definition-level-histogram
    :initarg :definition-level-histogram
    :initform nil
    :type (or null (vector (signed-byte 64)))
    :documentation
    "Same as repetition_level_histogram except for definition levels.

This field may be omitted if max_definition_level is 0 or 1 without
loss of information.

"))
  (:documentation
   "A structure for capturing metadata for estimating the unencoded,
uncompressed size of data written. This is useful for readers to estimate
how much memory is needed to reconstruct data in their memory model and for
fine grained filter pushdown on nested structures (the histograms contained
in this structure can help determine the number of nulls at a particular
nesting level and maximum length of lists).
"))
;; Per-row-group / per-page statistics; every slot is optional (defaults to
;; NIL).  The MAX-VALUE docstring previously had backslashes where the
;; upstream text has double quotes; those are restored.  MIN and MIN-VALUE had
;; no documentation of their own and now point at their documented siblings.
(defclass parquet-statistics (parquet-object)
  ((max
    :initarg :max
    :initform nil
    :type (or null octet-vector)
    :documentation
    "DEPRECATED: min and max value of the column. Use min_value and max_value.

Values are encoded using PLAIN encoding, except that variable-length byte
arrays do not include a length prefix.

These fields encode min and max values determined by signed comparison
only. New files should use the correct order for a column's logical type
and store the values in the min_value and max_value fields.

To support older readers, these may be set when the column order is
signed.
")
   (min
    :initarg :min
    :initform nil
    :type (or null octet-vector)
    :documentation
    "DEPRECATED: min value of the column. See the MAX slot for details.
")
   (null-count
    :initarg :null-count
    :initform nil
    :type (or null (signed-byte 64))
    :documentation "count of null value in the column
")
   (distinct-count
    :initarg :distinct-count
    :initform nil
    :type (or null (signed-byte 64))
    :documentation "count of distinct values occurring
")
   (max-value
    :initarg :max-value
    :initform nil
    :type (or null octet-vector)
    :documentation
    "Lower and upper bound values for the column, determined by its ColumnOrder.

These may be the actual minimum and maximum values found on a page or column
chunk, but can also be (more compact) values that do not exist on a page or
column chunk. For example, instead of storing \"Blart Versenwald III\", a writer
may set min_value=\"B\", max_value=\"C\". Such more compact values must still be
valid values within the column's logical type.

Values are encoded using PLAIN encoding, except that variable-length byte
arrays do not include a length prefix.
")
   (min-value
    :initarg :min-value
    :initform nil
    :type (or null octet-vector)
    :documentation
    "Lower bound value for the column. See the MAX-VALUE slot for details.
")
   (is-max-value-exact
    :initarg :is-max-value-exact
    :initform nil
    :type (or null boolean)
    :documentation
    "If true, max_value is the actual maximum value for a column
")
   (is-min-value-exact
    :initarg :is-min-value-exact
    :initform nil
    :type (or null boolean)
    :documentation
    "If true, min_value is the actual minimum value for a column
"))
  (:documentation "Statistics per row group and per page
All fields are optional.
"))
;; Slotless class used purely as a logical-type annotation tag.
(defclass parquet-string-type (parquet-object)
  ()
  (:documentation "Empty structs to use as logical type annotations
"))
;; Further slotless annotation classes; each is the value type of one slot
;; of PARQUET-LOGICAL-TYPE.
(defclass parquet-uuid-type (parquet-object)
  ())
(defclass parquet-map-type (parquet-object)
  ())
(defclass parquet-list-type (parquet-object)
  ())
(defclass parquet-enum-type (parquet-object)
  ())
(defclass parquet-date-type (parquet-object)
  ())
(defclass parquet-float16-type (parquet-object)
  ())
;; Slotless annotation class for always-null columns.
(defclass parquet-null-type (parquet-object)
  ()
  (:documentation
   "Logical type to annotate a column that is always null.

Sometimes when discovering the schema of existing data, values are always
null and the physical type can't be determined. This annotation signals
the case where the physical type was guessed from all null values.
"))
;; Decimal annotation.  Neither slot has an initform, so both are unbound
;; until supplied through their initargs.
(defclass parquet-decimal-type (parquet-object)
  ((scale
    :type (signed-byte 32)
    :initarg :scale)
   (precision
    :type (signed-byte 32)
    :initarg :precision))
  (:documentation "Decimal logical type annotation

Scale must be zero or a positive integer less than or equal to the precision.
Precision must be a non-zero positive integer.

To maintain forward-compatibility in v1, implementations using this logical
type must also set scale and precision on the annotated SchemaElement.

Allowed for physical types: INT32, INT64, FIXED_LEN_BYTE_ARRAY, and BYTE_ARRAY.
"))
;; Slotless time-unit tags; each is the value type of one slot of
;; PARQUET-TIME-UNIT.
(defclass parquet-milli-seconds (parquet-object)
  ()
  (:documentation "Time units for logical types
"))
(defclass parquet-micro-seconds (parquet-object)
  ())
(defclass parquet-nano-seconds (parquet-object)
  ())
;; One optional slot per time unit; every slot defaults to NIL.
(defclass parquet-time-unit (parquet-object)
  ((millis
    :type (or null parquet-milli-seconds)
    :initform nil
    :initarg :millis)
   (micros
    :type (or null parquet-micro-seconds)
    :initform nil
    :initarg :micros)
   (nanos
    :type (or null parquet-nano-seconds)
    :initform nil
    :initarg :nanos)))
;; Timestamp annotation: a UTC-adjustment flag and a time unit, both without
;; initforms.
(defclass parquet-timestamp-type (parquet-object)
  ((isadjustedtoutc
    :type boolean
    :initarg :isadjustedtoutc)
   (unit
    :type parquet-time-unit
    :initarg :unit))
  (:documentation "Timestamp logical type annotation

Allowed for physical types: INT64
"))
;; Time-of-day annotation: same slot layout as PARQUET-TIMESTAMP-TYPE.
(defclass parquet-time-type (parquet-object)
  ((isadjustedtoutc
    :type boolean
    :initarg :isadjustedtoutc)
   (unit
    :type parquet-time-unit
    :initarg :unit))
  (:documentation "Time logical type annotation

Allowed for physical types: INT32 (millis), INT64 (micros, nanos)
"))
;; Integer annotation.  BITWIDTH was the only slot in this file declared
;; without a :type; the upstream Thrift field is an i8, so it is declared as
;; (signed-byte 8), which admits every documented width (8, 16, 32, 64).
(defclass parquet-int-type (parquet-object)
  ((bitwidth
    :initarg :bitwidth
    :type (signed-byte 8))
   (issigned
    :initarg :issigned
    :type boolean))
  (:documentation "Integer logical type annotation

bitWidth must be 8, 16, 32, or 64.

Allowed for physical types: INT32, INT64
"))
;; Slotless annotation class for embedded JSON.
(defclass parquet-json-type (parquet-object)
  ()
  (:documentation "Embedded JSON logical type annotation

Allowed for physical types: BYTE_ARRAY
"))
;; Slotless annotation class for embedded BSON.
(defclass parquet-bson-type (parquet-object)
  ()
  (:documentation "Embedded BSON logical type annotation

Allowed for physical types: BYTE_ARRAY
"))
;; One optional slot per logical-type annotation class; every slot defaults
;; to NIL.
(defclass parquet-logical-type (parquet-object)
  ((string    :initarg :string    :type (or null parquet-string-type)
              :initform nil)
   (map       :initarg :map       :type (or null parquet-map-type)
              :initform nil)
   (list      :initarg :list      :type (or null parquet-list-type)
              :initform nil)
   (enum      :initarg :enum      :type (or null parquet-enum-type)
              :initform nil)
   (decimal   :initarg :decimal   :type (or null parquet-decimal-type)
              :initform nil)
   (date      :initarg :date      :type (or null parquet-date-type)
              :initform nil)
   (time      :initarg :time      :type (or null parquet-time-type)
              :initform nil)
   (timestamp :initarg :timestamp :type (or null parquet-timestamp-type)
              :initform nil)
   (integer   :initarg :integer   :type (or null parquet-int-type)
              :initform nil)
   (unknown   :initarg :unknown   :type (or null parquet-null-type)
              :initform nil)
   (json      :initarg :json      :type (or null parquet-json-type)
              :initform nil)
   (bson      :initarg :bson      :type (or null parquet-bson-type)
              :initform nil)
   (uuid      :initarg :uuid      :type (or null parquet-uuid-type)
              :initform nil)
   (float16   :initarg :float16   :type (or null parquet-float16-type)
              :initform nil))
  (:documentation "LogicalType annotations to replace ConvertedType.

To maintain compatibility, implementations using LogicalType for a
SchemaElement must also set the corresponding ConvertedType (if any)
from the following table.
"))
;; A single schema node.  NAME is the only slot without an initform; all
;; other slots default to NIL.
(defclass parquet-schema-element (parquet-object)
  ((type
    :initarg :type
    :type (or null parquet-type)
    :initform nil
    :documentation
    "Data type for this field. Not set if the current element is a non-leaf node
")
   (type-length
    :initarg :type-length
    :type (or null (signed-byte 32))
    :initform nil
    :documentation
    "If type is FIXED_LEN_BYTE_ARRAY, this is the byte length of the values.
Otherwise, if specified, this is the maximum bit length to store any of the values.
(e.g. a low cardinality INT col could have this set to 3). Note that this is
in the schema, and therefore fixed for the entire file.
")
   (repetition-type
    :initarg :repetition-type
    :type (or null parquet-field-repetition-type)
    :initform nil
    :documentation
    "repetition of the field. The root of the schema does not have a repetition_type.
All other nodes must have one
")
   (name
    :initarg :name
    :type string
    :documentation "Name of the field in the schema
")
   (num-children
    :initarg :num-children
    :type (or null (signed-byte 32))
    :initform nil
    :documentation
    "Nested fields. Since thrift does not support nested fields,
the nesting is flattened to a single list by a depth-first traversal.
The children count is used to construct the nested relationship.
This field is not set when the element is a primitive type
")
   (converted-type
    :initarg :converted-type
    :type (or null parquet-converted-type)
    :initform nil
    :documentation
    "DEPRECATED: When the schema is the result of a conversion from another model.
Used to record the original type to help with cross conversion.

This is superseded by logicalType.
")
   (scale
    :initarg :scale
    :type (or null (signed-byte 32))
    :initform nil
    :documentation
    "DEPRECATED: Used when this column contains decimal data.
See the DECIMAL converted type for more details.

This is superseded by using the DecimalType annotation in logicalType.
")
   (precision
    :initarg :precision
    :type (or null (signed-byte 32))
    :initform nil)
   (field-id
    :initarg :field-id
    :type (or null (signed-byte 32))
    :initform nil
    :documentation
    "When the original schema supports field ids, this will save the
original field id in the parquet schema
")
   (logicaltype
    :initarg :logicaltype
    :type (or null parquet-logical-type)
    :initform nil
    :documentation "The logical type of this SchemaElement

LogicalType replaces ConvertedType, but ConvertedType is still required
for some logical types to ensure forward-compatibility in format v1.
"))
  (:documentation "Represents a element inside a schema definition.
 - if it is a group (inner node) then type is undefined and num_children is defined
 - if it is a primitive type (leaf) then type is defined and num_children is undefined
the nodes are listed in depth first traversal order.
"))
;; Header of a (v1) data page.  Only STATISTICS has an initform (NIL).
(defclass parquet-data-page-header (parquet-object)
  ((num-values
    :initarg :num-values
    :type (signed-byte 32)
    :documentation
    "Number of values, including NULLs, in this data page.

If a OffsetIndex is present, a page must begin at a row
boundary (repetition_level = 0). Otherwise, pages may begin
within a row (repetition_level > 0).

")
   (encoding
    :initarg :encoding
    :type parquet-encoding
    :documentation "Encoding used for this data page *
")
   (definition-level-encoding
    :initarg :definition-level-encoding
    :type parquet-encoding
    :documentation
    "Encoding used for definition levels *
")
   (repetition-level-encoding
    :initarg :repetition-level-encoding
    :type parquet-encoding
    :documentation
    "Encoding used for repetition levels *
")
   (statistics
    :initarg :statistics
    :type (or null parquet-statistics)
    :initform nil
    :documentation
    "Optional statistics for the data in this page *
"))
  (:documentation "Data page header
"))
;; Slotless header class for index pages.
(defclass parquet-index-page-header (parquet-object)
  ())
;; Header of a dictionary page.  Only IS-SORTED has an initform (NIL).
(defclass parquet-dictionary-page-header (parquet-object)
  ((num-values
    :initarg :num-values
    :type (signed-byte 32)
    :documentation "Number of values in the dictionary *
")
   (encoding
    :initarg :encoding
    :type parquet-encoding
    :documentation "Encoding using this dictionary page *
")
   (is-sorted
    :initarg :is-sorted
    :type (or null boolean)
    :initform nil
    :documentation
    "If true, the entries in the dictionary are sorted in ascending order *
"))
  (:documentation
   "The dictionary page must be placed at the first position of the column chunk
if it is partly or completely dictionary encoded. At most one dictionary page
can be placed in a column chunk.

"))
;; Header of a v2 data page.
(defclass parquet-data-page-header-v2 (parquet-object)
  ((num-values
    :initarg :num-values
    :type (signed-byte 32)
    :documentation
    "Number of values, including NULLs, in this data page. *
")
   (num-nulls
    :initarg :num-nulls
    :type (signed-byte 32)
    :documentation "Number of NULL values, in this data page.
Number of non-null = num_values - num_nulls which is also the number of values in the data section *
")
   (num-rows
    :initarg :num-rows
    :type (signed-byte 32)
    :documentation
    "Number of rows in this data page. Every page must begin at a
row boundary (repetition_level = 0): rows must **not** be
split across page boundaries when using V2 data pages.

")
   (encoding
    :initarg :encoding
    :type parquet-encoding
    :documentation "Encoding used for data in this page *
")
   (definition-levels-byte-length
    :initarg :definition-levels-byte-length
    :type (signed-byte 32)
    :documentation
    "Length of the definition levels
")
   (repetition-levels-byte-length
    :initarg :repetition-levels-byte-length
    :type (signed-byte 32)
    :documentation
    "Length of the repetition levels
")
   ;; Defaults to T, not NIL: the field's own documentation says a missing
   ;; value means "compressed" (the upstream Thrift default is true).  With
   ;; a boolean slot NIL cannot mean "missing", so an absent field used to
   ;; read as "not compressed".  An explicit :is-compressed nil is unaffected.
   (is-compressed
    :initarg :is-compressed
    :initform t
    :type (or null boolean)
    :documentation "Whether the values are compressed.
Which means the section of the page between
definition_levels_byte_length + repetition_levels_byte_length + 1 and compressed_page_size (included)
is compressed with the compression_codec.
If missing it is considered compressed
")
   (statistics
    :initarg :statistics
    :initform nil
    :type (or null parquet-statistics)
    :documentation
    "Optional statistics for the data in this page *
"))
  (:documentation
   "New page format allowing reading levels without decompressing the data
Repetition and definition levels are uncompressed
The remaining section containing the data is compressed if is_compressed is true

"))
;; Slotless tag class; value type of the BLOCK slot of
;; PARQUET-BLOOM-FILTER-ALGORITHM.
(defclass parquet-split-block-algorithm (parquet-object)
  ()
  (:documentation "Block-based algorithm type annotation. *
"))
;; Bloom filter algorithm selector with a single optional slot.
(defclass parquet-bloom-filter-algorithm (parquet-object)
  ((block
    :initarg :block
    :type (or null parquet-split-block-algorithm)
    :initform nil
    :documentation
    "Block-based Bloom filter. *
"))
  (:documentation "The algorithm used in Bloom filter. *
"))
;; Slotless tag class; value type of the XXHASH slot of
;; PARQUET-BLOOM-FILTER-HASH.
(defclass parquet-xx-hash (parquet-object)
  ()
  (:documentation
   "Hash strategy type annotation. xxHash is an extremely fast non-cryptographic hash
algorithm. It uses 64 bits version of xxHash.

"))
;; Bloom filter hash selector with a single optional slot.
(defclass parquet-bloom-filter-hash (parquet-object)
  ((xxhash
    :initarg :xxhash
    :type (or null parquet-xx-hash)
    :initform nil
    :documentation "xxHash Strategy. *
"))
  (:documentation
   "The hash function used in Bloom filter. This function takes the hash of a column value
using plain encoding.

"))
;; Slotless tag class; value type of the UNCOMPRESSED slot of
;; PARQUET-BLOOM-FILTER-COMPRESSION.
(defclass parquet-uncompressed (parquet-object)
  ()
  (:documentation "The compression used in the Bloom filter.

"))
;; Bloom filter compression selector with a single optional slot.
(defclass parquet-bloom-filter-compression (parquet-object)
  ((uncompressed
    :initarg :uncompressed
    :type (or null parquet-uncompressed)
    :initform nil)))
;; Bloom filter header.  No slot has an initform, so all four are unbound
;; until supplied through their initargs.
(defclass parquet-bloom-filter-header (parquet-object)
  ((numbytes
    :initarg :numbytes
    :type (signed-byte 32)
    :documentation "The size of bitset in bytes *
")
   (algorithm
    :initarg :algorithm
    :type parquet-bloom-filter-algorithm
    :documentation "The algorithm for setting bits. *
")
   (hash
    :initarg :hash
    :type parquet-bloom-filter-hash
    :documentation "The hash function used for Bloom filter. *
")
   (compression
    :initarg :compression
    :type parquet-bloom-filter-compression
    :documentation
    "The compression used in the Bloom filter *
"))
  (:documentation
   "Bloom filter header is stored at beginning of Bloom filter data of each column
and followed by its bitset.

"))
;; Common page header: page type, sizes, optional CRC, and one optional slot
;; per specific header class.  The CRC docstring's duplicated word
;; ("to be be calculated") is corrected.
(defclass parquet-page-header (parquet-object)
  ((type
    :initarg :type
    :type parquet-page-type
    :documentation
    "the type of the page: indicates which of the *_header fields is set *
")
   (uncompressed-page-size
    :initarg :uncompressed-page-size
    :type (signed-byte 32)
    :documentation
    "Uncompressed page size in bytes (not including this header) *
")
   (compressed-page-size
    :initarg :compressed-page-size
    :type (signed-byte 32)
    :documentation
    "Compressed (and potentially encrypted) page size in bytes, not including this header *
")
   (crc
    :initarg :crc
    :initform nil
    :type (or null (signed-byte 32))
    :documentation
    "The 32-bit CRC checksum for the page, to be calculated as follows:

- The standard CRC32 algorithm is used (with polynomial 0x04C11DB7,
 the same as in e.g. GZip).
- All page types can have a CRC (v1 and v2 data pages, dictionary pages,
 etc.).
- The CRC is computed on the serialization binary representation of the page
 (as written to disk), excluding the page header. For example, for v1
 data pages, the CRC is computed on the concatenation of repetition levels,
 definition levels and column values (optionally compressed, optionally
 encrypted).
- The CRC computation therefore takes place after any compression
 and encryption steps, if any.

If enabled, this allows for disabling checksumming in HDFS if only a few
pages need to be read.
")
   (data-page-header
    :initarg :data-page-header
    :initform nil
    :type (or null parquet-data-page-header))
   (index-page-header
    :initarg :index-page-header
    :initform nil
    :type (or null parquet-index-page-header))
   (dictionary-page-header
    :initarg :dictionary-page-header
    :initform nil
    :type (or null parquet-dictionary-page-header))
   (data-page-header-v2
    :initarg :data-page-header-v2
    :initform nil
    :type (or null parquet-data-page-header-v2))))
;; String key with an optional string value (VALUE defaults to NIL).
(defclass parquet-key-value (parquet-object)
  ((key
    :type string
    :initarg :key)
   (value
    :type (or null string)
    :initform nil
    :initarg :value))
  (:documentation "Wrapper struct to store key values
"))
;; Sort specification for one column.  No slot has an initform.
(defclass parquet-sorting-column (parquet-object)
  ((column-idx
    :initarg :column-idx
    :type (signed-byte 32)
    :documentation
    "The ordinal position of the column (in this row group) *
")
   (descending
    :initarg :descending
    :type boolean
    :documentation
    "If true, indicates this column is sorted in descending order. *
")
   (nulls-first
    :initarg :nulls-first
    :type boolean
    :documentation
    "If true, nulls will come before non-null values, otherwise,
nulls go at the end.
"))
  (:documentation "Sort order within a RowGroup of a leaf column
"))
;; Page count for one (page type, encoding) pair.  The PAGE-TYPE docstring
;; previously had backslashes where the upstream text has `/`; restored.
(defclass parquet-page-encoding-stats (parquet-object)
  ((page-type
    :initarg :page-type
    :type parquet-page-type
    :documentation "the page type (data/dic/...) *
")
   (encoding
    :initarg :encoding
    :type parquet-encoding
    :documentation "encoding of the page *
")
   (count
    :initarg :count
    :type (signed-byte 32)
    :documentation
    "number of pages of this type with this encoding *
"))
  (:documentation "statistics of a given page type and encoding
"))
;; Metadata for one column chunk.  Slots without an initform are unbound
;; until supplied; the rest default to NIL.  Three docstrings previously had
;; backslashes where the upstream text has `/` ("key/value", "I/O",
;; "nullability/list length"); restored.
(defclass parquet-column-meta-data (parquet-object)
  ((type
    :initarg :type
    :type parquet-type
    :documentation "Type of this column *
")
   (encodings
    :initarg :encodings
    :type (vector parquet-encoding)
    :documentation
    "Set of all encodings used for this column. The purpose is to validate
whether we can decode those pages. *
")
   (path-in-schema
    :initarg :path-in-schema
    :type (vector string)
    :documentation "Path in schema *
")
   (codec
    :initarg :codec
    :type parquet-compression-codec
    :documentation "Compression codec *
")
   (num-values
    :initarg :num-values
    :type (signed-byte 64)
    :documentation "Number of values in this column *
")
   (total-uncompressed-size
    :initarg :total-uncompressed-size
    :type (signed-byte 64)
    :documentation
    "total byte size of all uncompressed pages in this column chunk (including the headers) *
")
   (total-compressed-size
    :initarg :total-compressed-size
    :type (signed-byte 64)
    :documentation
    "total byte size of all compressed, and potentially encrypted, pages
in this column chunk (including the headers) *
")
   (key-value-metadata
    :initarg :key-value-metadata
    :initform nil
    :type (or null (vector parquet-key-value))
    :documentation "Optional key/value metadata *
")
   (data-page-offset
    :initarg :data-page-offset
    :type (signed-byte 64)
    :documentation
    "Byte offset from beginning of file to first data page *
")
   (index-page-offset
    :initarg :index-page-offset
    :initform nil
    :type (or null (signed-byte 64))
    :documentation
    "Byte offset from beginning of file to root index page *
")
   (dictionary-page-offset
    :initarg :dictionary-page-offset
    :initform nil
    :type (or null (signed-byte 64))
    :documentation
    "Byte offset from the beginning of file to first (only) dictionary page *
")
   (statistics
    :initarg :statistics
    :initform nil
    :type (or null parquet-statistics)
    :documentation "optional statistics for this column chunk
")
   (encoding-stats
    :initarg :encoding-stats
    :initform nil
    :type (or null (vector parquet-page-encoding-stats))
    :documentation
    "Set of all encodings used for pages in this column chunk.
This information can be used to determine if all data pages are
dictionary encoded for example *
")
   (bloom-filter-offset
    :initarg :bloom-filter-offset
    :initform nil
    :type (or null (signed-byte 64))
    :documentation
    "Byte offset from beginning of file to Bloom filter data. *
")
   (bloom-filter-length
    :initarg :bloom-filter-length
    :initform nil
    :type (or null (signed-byte 32))
    :documentation
    "Size of Bloom filter data including the serialized header, in bytes.
Added in 2.10 so readers may not read this field from old files and
it can be obtained after the BloomFilterHeader has been deserialized.
Writers should write this field so readers can read the bloom filter
in a single I/O.
")
   (size-statistics
    :initarg :size-statistics
    :initform nil
    :type (or null parquet-size-statistics)
    :documentation
    "Optional statistics to help estimate total memory when converted to in-memory
representations. The histograms contained in these statistics can
also be useful in some cases for more fine-grained nullability/list length
filter pushdown.
"))
  (:documentation "Description for column metadata
"))
;; Slotless tag class; value type of the ENCRYPTION-WITH-FOOTER-KEY slot of
;; PARQUET-COLUMN-CRYPTO-META-DATA.
(defclass parquet-encryption-with-footer-key (parquet-object)
  ())
;; Column-key encryption details: a schema path plus optional key metadata
;; (KEY-METADATA defaults to NIL).
(defclass parquet-encryption-with-column-key (parquet-object)
  ((path-in-schema
    :initarg :path-in-schema
    :type (vector string)
    :documentation "Column path in schema *
")
   (key-metadata
    :initarg :key-metadata
    :type (or null octet-vector)
    :initform nil
    :documentation
    "Retrieval metadata of column encryption key *
")))
;; One optional slot per encryption variant; both default to NIL.
(defclass parquet-column-crypto-meta-data (parquet-object)
  ((encryption-with-footer-key
    :type (or null parquet-encryption-with-footer-key)
    :initform nil
    :initarg :encryption-with-footer-key)
   (encryption-with-column-key
    :type (or null parquet-encryption-with-column-key)
    :initform nil
    :initarg :encryption-with-column-key)))
;; One column chunk.  FILE-OFFSET is the only slot without an initform; all
;; other slots default to NIL.  The META-DATA docstring previously had
;; backslashes where the upstream text has `/` ("file_path/file_offset");
;; restored.
(defclass parquet-column-chunk (parquet-object)
  ((file-path
    :initarg :file-path
    :initform nil
    :type (or null string)
    :documentation
    "File where column data is stored. If not set, assumed to be same file as
metadata. This path is relative to the current file.

")
   (file-offset
    :initarg :file-offset
    :type (signed-byte 64)
    :documentation
    "Deprecated: Byte offset in file_path to the ColumnMetaData

Past use of this field has been inconsistent, with some implementations
using it to point to the ColumnMetaData and some using it to point to
the first page in the column chunk. In many cases, the ColumnMetaData at this
location is wrong. This field is now deprecated and should not be used.
Writers should set this field to 0 if no ColumnMetaData has been written outside
the footer.
")
   (meta-data
    :initarg :meta-data
    :initform nil
    :type (or null parquet-column-meta-data)
    :documentation
    "Column metadata for this chunk. Some writers may also replicate this at the
location pointed to by file_path/file_offset.
Note: while marked as optional, this field is in fact required by most major
Parquet implementations. As such, writers MUST populate this field.

")
   (offset-index-offset
    :initarg :offset-index-offset
    :initform nil
    :type (or null (signed-byte 64))
    :documentation
    "File offset of ColumnChunk's OffsetIndex *
")
   (offset-index-length
    :initarg :offset-index-length
    :initform nil
    :type (or null (signed-byte 32))
    :documentation
    "Size of ColumnChunk's OffsetIndex, in bytes *
")
   (column-index-offset
    :initarg :column-index-offset
    :initform nil
    :type (or null (signed-byte 64))
    :documentation
    "File offset of ColumnChunk's ColumnIndex *
")
   (column-index-length
    :initarg :column-index-length
    :initform nil
    :type (or null (signed-byte 32))
    :documentation
    "Size of ColumnChunk's ColumnIndex, in bytes *
")
   (crypto-metadata
    :initarg :crypto-metadata
    :initform nil
    :type (or null parquet-column-crypto-meta-data)
    :documentation
    "Crypto metadata of encrypted columns *
")
   (encrypted-column-metadata
    :initarg :encrypted-column-metadata
    :initform nil
    :type (or null octet-vector)
    :documentation
    "Encrypted column metadata for this chunk *
")))
;; One row group.  COLUMNS, TOTAL-BYTE-SIZE and NUM-ROWS have no initform;
;; the remaining slots default to NIL.
(defclass parquet-row-group (parquet-object)
  ((columns
    :initarg :columns
    :type (vector parquet-column-chunk)
    :documentation
    "Metadata for each column chunk in this row group.
This list must have the same order as the SchemaElement list in FileMetaData.

")
   (total-byte-size
    :initarg :total-byte-size
    :type (signed-byte 64)
    :documentation
    "Total byte size of all the uncompressed column data in this row group *
")
   (num-rows
    :initarg :num-rows
    :type (signed-byte 64)
    :documentation "Number of rows in this row group *
")
   (sorting-columns
    :initarg :sorting-columns
    :type (or null (vector parquet-sorting-column))
    :initform nil
    :documentation
    "If set, specifies a sort ordering of the rows in this RowGroup.
The sorting columns can be a subset of all the columns.
")
   (file-offset
    :initarg :file-offset
    :type (or null (signed-byte 64))
    :initform nil
    :documentation
    "Byte offset from beginning of file to first page (data or dictionary)
in this row group *
")
   (total-compressed-size
    :initarg :total-compressed-size
    :type (or null (signed-byte 64))
    :initform nil
    :documentation
    "Total byte size of all compressed (and potentially encrypted) column data
in this row group *
")
   (ordinal
    :initarg :ordinal
    :type (or null (signed-byte 16))
    :initform nil
    :documentation "Row group ordinal in the file *
")))
;; Slotless tag class for type-defined ordering.
(defclass parquet-type-defined-order (parquet-object)
  ()
  (:documentation
   "Empty struct to signal the order defined by the physical or logical type
"))
;; Single-slot class; TYPE-ORDER defaults to NIL when its initarg is omitted.
(defclass parquet-column-order (parquet-object)
  ((type-order
    :initarg :type-order
    :initform nil
    :type (or null parquet-type-defined-order)
    :documentation
    "The sort orders for logical types are:
  UTF8 - unsigned byte-wise comparison
  INT8 - signed comparison
  INT16 - signed comparison
  INT32 - signed comparison
  INT64 - signed comparison
  UINT8 - unsigned comparison
  UINT16 - unsigned comparison
  UINT32 - unsigned comparison
  UINT64 - unsigned comparison
  DECIMAL - signed comparison of the represented value
  DATE - signed comparison
  TIME_MILLIS - signed comparison
  TIME_MICROS - signed comparison
  TIMESTAMP_MILLIS - signed comparison
  TIMESTAMP_MICROS - signed comparison
  INTERVAL - undefined
  JSON - unsigned byte-wise comparison
  BSON - unsigned byte-wise comparison
  ENUM - unsigned byte-wise comparison
  LIST - undefined
  MAP - undefined

In the absence of logical types, the sort order is determined by the physical type:
  BOOLEAN - false, true
  INT32 - signed comparison
  INT64 - signed comparison
  INT96 (only used for legacy timestamps) - undefined
  FLOAT - signed comparison of the represented value (*)
  DOUBLE - signed comparison of the represented value (*)
  BYTE_ARRAY - unsigned byte-wise comparison
  FIXED_LEN_BYTE_ARRAY - unsigned byte-wise comparison

(*) Because the sorting order is not specified properly for floating
    point values (relations vs. total ordering) the following
    compatibility rules should be applied when reading statistics:
    - If the min is a NaN, it should be ignored.
    - If the max is a NaN, it should be ignored.
    - If the min is +0, the row group may contain -0 values as well.
    - If the max is -0, the row group may contain +0 values as well.
    - When looking for NaN values, min and max should be ignored.

    When writing statistics the following rules should be followed:
    - NaNs should not be written to min or max statistics fields.
    - If the computed max value is zero (whether negative or positive),
      `+0.0` should be written into the max statistics field.
    - If the computed min value is zero (whether negative or positive),
      `-0.0` should be written into the min statistics field."))
  (:documentation
   "Union to specify the order used for the min_value and max_value fields for a
column. This union takes the role of an enhanced enum that allows rich
elements (which will be needed for a collation-based ordering in the future).

Possible values are:
* TypeDefinedOrder - the column uses the order defined by its logical or
  physical type (if there is no logical type).

If the reader does not support the value of this union, min and max stats
for this column should be ignored."))
;; NOTE(review): this file is generated (its header says "Do not modify").
;; Mirror this docstring cleanup in the generator so it survives regeneration.
;;
;; None of the slots has an :initform, so each stays unbound unless its
;; initarg is passed to MAKE-INSTANCE.
(defclass parquet-page-location (parquet-object)
  ((offset
    :initarg :offset
    :type (signed-byte 64)
    :documentation "Offset of the page in the file.")
   (compressed-page-size
    :initarg :compressed-page-size
    :type (signed-byte 32)
    :documentation
    "Size of the page, including header. Sum of compressed_page_size and header
length.")
   (first-row-index
    :initarg :first-row-index
    :type (signed-byte 64)
    :documentation
    "Index within the RowGroup of the first row of the page. When an
OffsetIndex is present, pages must begin on row boundaries
(repetition_level = 0)."))
  (:documentation
   "Location of a single page: its file offset, its size including header,
and the index of its first row within the row group."))
;; NOTE(review): this file is generated (its header says "Do not modify").
;; The docstring fixes below ("/" had been turned into a backslash, plus a
;; spelling fix) should also be made in the generator or its JSON input.
(defclass parquet-offset-index (parquet-object)
  ((page-locations
    :initarg :page-locations
    :type (vector parquet-page-location)
    :documentation
    "PageLocations, ordered by increasing PageLocation.offset. It is required
that page_locations[i].first_row_index < page_locations[i+1].first_row_index.")
   (unencoded-byte-array-data-bytes
    :initarg :unencoded-byte-array-data-bytes
    :initform nil
    :type (or null (vector (signed-byte 64)))
    :documentation
    "Unencoded/uncompressed size for BYTE_ARRAY types.

See documentation for unencoded_byte_array_data_bytes in SizeStatistics for
more details on this field."))
  (:documentation
   "Optional offsets for each data page in a ColumnChunk.

Forms part of the page index, along with ColumnIndex.

OffsetIndex may be present even if ColumnIndex is not."))
;; NOTE(review): this file is generated (its header says "Do not modify").
;; The docstring fixes below (quotation marks that had been turned into
;; backslashes, comment-terminator residue, missing MAX-VALUES text) should
;; also be made in the generator so they survive regeneration.
(defclass parquet-column-index (parquet-object)
  ((null-pages
    :initarg :null-pages
    :type (vector boolean)
    :documentation
    "A list of Boolean values to determine the validity of the corresponding
min and max values. If true, a page contains only null values, and writers
have to set the corresponding entries in min_values and max_values to
byte[0], so that all lists have the same length. If false, the
corresponding entries in min_values and max_values must be valid.")
   (min-values
    :initarg :min-values
    :type (vector octet-vector)
    :documentation
    "Together with MAX-VALUES: two lists containing lower and upper bounds for
the values of each page determined by the ColumnOrder of the column. These
may be the actual minimum and maximum values found on a page, but can also
be (more compact) values that do not exist on a page. For example, instead
of storing \"Blart Versenwald III\", a writer may set min_values[i]=\"B\",
max_values[i]=\"C\". Such more compact values must still be valid values
within the column's logical type. Readers must make sure that list entries
are populated before using them by inspecting null_pages.")
   (max-values
    :initarg :max-values
    :type (vector octet-vector)
    :documentation
    "Upper bounds for the values of each page; the counterpart of MIN-VALUES.
See the MIN-VALUES documentation for the contract shared by both lists.")
   (boundary-order
    :initarg :boundary-order
    :type parquet-boundary-order
    :documentation
    "Stores whether both min_values and max_values are ordered and if so, in
which direction. This allows readers to perform binary searches in both
lists. Readers cannot assume that max_values[i] <= min_values[i+1], even
if the lists are ordered.")
   (null-counts
    :initarg :null-counts
    :initform nil
    :type (or null (vector (signed-byte 64)))
    :documentation
    "A list containing the number of null values for each page.")
   (repetition-level-histograms
    :initarg :repetition-level-histograms
    :initform nil
    :type (or null (vector (signed-byte 64)))
    :documentation
    "Contains repetition level histograms for each page
concatenated together. The repetition_level_histogram field on
SizeStatistics contains more details.

When present the length should always be (number of pages *
(max_repetition_level + 1)) elements.

Element 0 is the first element of the histogram for the first page.
Element (max_repetition_level + 1) is the first element of the histogram
for the second page.")
   (definition-level-histograms
    :initarg :definition-level-histograms
    :initform nil
    :type (or null (vector (signed-byte 64)))
    :documentation
    "Same as repetition_level_histograms except for definitions levels."))
  (:documentation
   "Optional statistics for each data page in a ColumnChunk.

Forms part of the page index, along with OffsetIndex.

If this structure is present, OffsetIndex must also be present.

For each field in this structure, <field>[i] refers to the page at
OffsetIndex.page_locations[i]"))
;; NOTE(review): this file is generated (its header says "Do not modify").
;; Mirror this docstring cleanup in the generator so it survives regeneration.
(defclass parquet-aes-gcm-v1 (parquet-object)
  ((aad-prefix
    :initarg :aad-prefix
    :initform nil
    :type (or null octet-vector)
    :documentation "AAD prefix.")
   (aad-file-unique
    :initarg :aad-file-unique
    :initform nil
    :type (or null octet-vector)
    :documentation "Unique file identifier part of AAD suffix.")
   (supply-aad-prefix
    :initarg :supply-aad-prefix
    :initform nil
    ;; BOOLEAN already contains NIL, so a NIL here cannot distinguish
    ;; "not set" from "false".
    :type (or null boolean)
    :documentation
    "In files encrypted with AAD prefix without storing it,
readers must supply the prefix."))
  (:documentation
   "AAD-related settings held in the AES-GCM-V1 slot of
PARQUET-ENCRYPTION-ALGORITHM. Every slot is optional and defaults to NIL."))
;; NOTE(review): this file is generated (its header says "Do not modify").
;; Mirror this docstring cleanup in the generator so it survives regeneration.
(defclass parquet-aes-gcm-ctr-v1 (parquet-object)
  ((aad-prefix
    :initarg :aad-prefix
    :initform nil
    :type (or null octet-vector)
    :documentation "AAD prefix.")
   (aad-file-unique
    :initarg :aad-file-unique
    :initform nil
    :type (or null octet-vector)
    :documentation "Unique file identifier part of AAD suffix.")
   (supply-aad-prefix
    :initarg :supply-aad-prefix
    :initform nil
    ;; BOOLEAN already contains NIL, so a NIL here cannot distinguish
    ;; "not set" from "false".
    :type (or null boolean)
    :documentation
    "In files encrypted with AAD prefix without storing it,
readers must supply the prefix."))
  (:documentation
   "AAD-related settings held in the AES-GCM-CTR-V1 slot of
PARQUET-ENCRYPTION-ALGORITHM. Every slot is optional and defaults to NIL."))
;; NOTE(review): presumably mirrors a Thrift union in which exactly one of the
;; two slots is populated -- nothing in this class enforces that; confirm
;; against the generator input.
(defclass parquet-encryption-algorithm (parquet-object)
  ((aes-gcm-v1
    :type (or null parquet-aes-gcm-v1)
    :initform nil
    :initarg :aes-gcm-v1)
   (aes-gcm-ctr-v1
    :type (or null parquet-aes-gcm-ctr-v1)
    :initform nil
    :initarg :aes-gcm-ctr-v1))
  (:documentation
   "Holder for the encryption algorithm settings: one optional slot per
algorithm variant (AES-GCM-V1, AES-GCM-CTR-V1), each defaulting to NIL."))
;; NOTE(review): this file is generated (its header says "Do not modify").
;; The docstring fixes below ("key/value" had its slash turned into a
;; backslash, comment-terminator residue, a grammar fix) should also be made
;; in the generator so they survive regeneration.
;;
;; VERSION, SCHEMA, NUM-ROWS and ROW-GROUPS have no :initform and stay unbound
;; unless supplied; the remaining slots default to NIL.
(defclass parquet-file-meta-data (parquet-object)
  ((version
    :initarg :version
    :type (signed-byte 32)
    :documentation "Version of this file.")
   (schema
    :initarg :schema
    :type (vector parquet-schema-element)
    :documentation
    "Parquet schema for this file. This schema contains metadata for all the columns.
The schema is represented as a tree with a single root. The nodes of the tree
are flattened to a list by doing a depth-first traversal.
The column metadata contains the path in the schema for that column which can be
used to map columns to nodes in the schema.
The first element is the root.")
   (num-rows
    :initarg :num-rows
    :type (signed-byte 64)
    :documentation "Number of rows in this file.")
   (row-groups
    :initarg :row-groups
    :type (vector parquet-row-group)
    :documentation "Row groups in this file.")
   (key-value-metadata
    :initarg :key-value-metadata
    :initform nil
    :type (or null (vector parquet-key-value))
    :documentation "Optional key/value metadata.")
   (created-by
    :initarg :created-by
    :initform nil
    :type (or null string)
    :documentation
    "String for application that wrote this file. This should be in the format
<Application> version <App Version> (build <App Build Hash>).
e.g. impala version 1.0 (build 6cf94d29b2b7115df4de2c06e2ab4326d721eb55)")
   (column-orders
    :initarg :column-orders
    :initform nil
    :type (or null (vector parquet-column-order))
    :documentation
    "Sort order used for the min_value and max_value fields in the Statistics
objects and the min_values and max_values fields in the ColumnIndex
objects of each column in this file. Sort orders are listed in the order
matching the columns in the schema. The indexes are not necessarily the same
though, because only leaf nodes of the schema are represented in the list
of sort orders.

Without column_orders, the meaning of the min_value and max_value fields
in the Statistics object and the ColumnIndex object is undefined. To ensure
well-defined behaviour, if these fields are written to a Parquet file,
column_orders must be written as well.

The obsolete min and max fields in the Statistics object are always sorted
by signed comparison regardless of column_orders.")
   (encryption-algorithm
    :initarg :encryption-algorithm
    :initform nil
    :type (or null parquet-encryption-algorithm)
    :documentation
    "Encryption algorithm. This field is set only in encrypted files
with plaintext footer. Files with encrypted footer store algorithm id
in FileCryptoMetaData structure.")
   (footer-signing-key-metadata
    :initarg :footer-signing-key-metadata
    :initform nil
    :type (or null octet-vector)
    :documentation
    "Retrieval metadata of key used for signing the footer.
Used only in encrypted files with plaintext footer."))
  (:documentation "Description for file metadata."))
;; NOTE(review): this file is generated (its header says "Do not modify").
;; Mirror this docstring cleanup in the generator so it survives regeneration.
;;
;; ENCRYPTION-ALGORITHM has no :initform, so it stays unbound unless supplied;
;; KEY-METADATA defaults to NIL.
(defclass parquet-file-crypto-meta-data (parquet-object)
  ((encryption-algorithm
    :initarg :encryption-algorithm
    :type parquet-encryption-algorithm
    :documentation
    "Encryption algorithm. This field is only used for files
with encrypted footer. Files with plaintext footer store algorithm id
inside footer (FileMetaData structure).")
   (key-metadata
    :initarg :key-metadata
    :initform nil
    :type (or null octet-vector)
    :documentation
    "Retrieval metadata of key used for encryption of footer,
and (possibly) columns."))
  (:documentation "Crypto metadata for files with encrypted footer."))