Mercurial > core / lisp/lib/dat/parquet/thrift.lisp
changeset 637: |
b88bf15f60d0 |
parent: |
849f72b72b41
|
child: |
642b3b82b20d |
author: |
Richard Westhaver <ellis@rwest.io> |
date: |
Wed, 04 Sep 2024 22:02:21 -0400 |
permissions: |
-rw-r--r-- |
description: |
parquet tweaks, import ox-man |
1 ;;; /home/ellis/comp/core/lisp/lib/dat/parquet/thrift.lisp --- Parquet Thrift Definitions -*- buffer-read-only:t -*- 3 ;; input = /home/ellis/comp/core/.stash/parquet.json 5 ;; This file was generated automatically by 6 ;; DAT/PARQUET/GEN:PARSE-PARQUET-THRIFT-DEFINITIONS 11 (in-package :dat/parquet) 13 (defun parquet-json-types () 16 (keywordicate (snakecase-name-to-lisp-name (json-getf x "name")))) 17 (parquet-json-enum-getf "Type"))) 18 (defparameter *parquet-types* (parquet-json-types)) 19 (defun parquet-json-converted-types () 22 (keywordicate (snakecase-name-to-lisp-name (json-getf x "name")))) 23 (parquet-json-enum-getf "ConvertedType"))) 24 (defparameter *parquet-converted-types* (parquet-json-converted-types)) 25 (defun parquet-json-field-repetition-types () 28 (keywordicate (snakecase-name-to-lisp-name (json-getf x "name")))) 29 (parquet-json-enum-getf "FieldRepetitionType"))) 30 (defparameter *parquet-field-repetition-types* 31 (parquet-json-field-repetition-types)) 32 (defun parquet-json-encodings () 35 (keywordicate (snakecase-name-to-lisp-name (json-getf x "name")))) 36 (parquet-json-enum-getf "Encoding"))) 37 (defparameter *parquet-encodings* (parquet-json-encodings)) 38 (defun parquet-json-compression-codecs () 41 (keywordicate (snakecase-name-to-lisp-name (json-getf x "name")))) 42 (parquet-json-enum-getf "CompressionCodec"))) 43 (defparameter *parquet-compression-codecs* (parquet-json-compression-codecs)) 44 (defun parquet-json-page-types () 47 (keywordicate (snakecase-name-to-lisp-name (json-getf x "name")))) 48 (parquet-json-enum-getf "PageType"))) 49 (defparameter *parquet-page-types* (parquet-json-page-types)) 50 (defun parquet-json-boundary-orders () 53 (keywordicate (snakecase-name-to-lisp-name (json-getf x "name")))) 54 (parquet-json-enum-getf "BoundaryOrder"))) 55 (defparameter *parquet-boundary-orders* (parquet-json-boundary-orders)) 56 (eval-when (:compile-toplevel :load-toplevel :execute) 57 (sb-impl::%deftype 'parquet-boolean 58 
(sb-impl::constant-type-expander 'parquet-boolean 60 (sb-c:source-location))) 61 (eval-when (:compile-toplevel :load-toplevel :execute) 62 (sb-impl::%deftype 'parquet-int32 63 (sb-impl::constant-type-expander 'parquet-int32 66 (sb-c:source-location))) 67 (eval-when (:compile-toplevel :load-toplevel :execute) 68 (sb-impl::%deftype 'parquet-int64 69 (sb-impl::constant-type-expander 'parquet-int64 72 (sb-c:source-location))) 73 (eval-when (:compile-toplevel :load-toplevel :execute) 74 (sb-impl::%deftype 'parquet-int96 75 (sb-impl::constant-type-expander 'parquet-int96 78 (sb-c:source-location))) 79 (eval-when (:compile-toplevel :load-toplevel :execute) 80 (sb-impl::%deftype 'parquet-float 81 (sb-impl::constant-type-expander 'parquet-float 83 (sb-c:source-location))) 84 (eval-when (:compile-toplevel :load-toplevel :execute) 85 (sb-impl::%deftype 'parquet-double 86 (sb-impl::constant-type-expander 'parquet-double 87 (progn 'double-float)) 88 (sb-c:source-location))) 89 (eval-when (:compile-toplevel :load-toplevel :execute) 90 (sb-impl::%deftype 'parquet-byte-array 91 (sb-int:named-lambda (sb-impl::type-expander 94 (declare (sb-c::lambda-list (&optional size))) 95 (sb-int:named-ds-bind (:macro parquet-byte-array 99 (declare (sb-c::constant-value size)) 100 (block parquet-byte-array `(octet-vector ,size)))) 102 (eval-when (:compile-toplevel :load-toplevel :execute) 103 (sb-impl::%deftype 'parquet-fixed-len-byte-array 104 (sb-int:named-lambda (sb-impl::type-expander 105 parquet-fixed-len-byte-array) 107 (declare (sb-c::lambda-list (size))) 108 (sb-int:named-ds-bind (:macro 109 parquet-fixed-len-byte-array 113 (declare (sb-c::constant-value size)) 114 (block parquet-fixed-len-byte-array 115 `(octet-vector ,size)))) 117 (defclass parquet-size-statistics (dat/parquet:parquet-object) 118 ((unencoded-byte-array-data-bytes :documentation 119 "The number of physical bytes stored for BYTE_ARRAY data values assuming 120 no encoding. 
This is exclusive of the bytes needed to store the length of 121 each byte array. In other words, this field is equivalent to the `(size 122 of PLAIN-ENCODING the byte array values) - (4 bytes * number of values 123 written)`. To determine unencoded sizes of other types readers can use 124 schema information multiplied by the number of non-null and null values. 125 The number of null/non-null values can be inferred from the histograms 128 For example, if a column chunk is dictionary-encoded with dictionary 129 [\"a\", \"bc\", \"cde\"], and a data page contains the indices [0, 0, 1, 2], 130 then this value for that data page should be 7 (1 + 1 + 2 + 3). 132 This field should only be set for types that use BYTE_ARRAY as their 135 :initarg :unencoded-byte-array-data-bytes :initform nil :type 136 (or null (signed-byte 64))) 137 (repetition-level-histogram :documentation 138 "When present, there is expected to be one element corresponding to each 139 repetition (i.e. size=max repetition_level+1) where each element 140 represents the number of times the repetition level was observed in the 143 This field may be omitted if max_repetition_level is 0 without loss 147 :initarg :repetition-level-histogram :initform nil :type 148 (or null (vector (signed-byte 64)))) 149 (definition-level-histogram :documentation 150 "Same as repetition_level_histogram except for definition levels. 152 This field may be omitted if max_definition_level is 0 or 1 without 156 :initarg :definition-level-histogram :initform nil :type 157 (or null (vector (signed-byte 64))))) 159 "A structure for capturing metadata for estimating the unencoded, 160 uncompressed size of data written. 
This is useful for readers to estimate 161 how much memory is needed to reconstruct data in their memory model and for 162 fine grained filter pushdown on nested structures (the histograms contained 163 in this structure can help determine the number of nulls at a particular 164 nesting level and maximum length of lists). 166 (defclass parquet-statistics (dat/parquet:parquet-object) 168 "DEPRECATED: min and max value of the column. Use min_value and max_value. 170 Values are encoded using PLAIN encoding, except that variable-length byte 171 arrays do not include a length prefix. 173 These fields encode min and max values determined by signed comparison 174 only. New files should use the correct order for a column's logical type 175 and store the values in the min_value and max_value fields. 177 To support older readers, these may be set when the column order is 180 :initarg :max :initform nil :type (or null octet-vector)) 181 (min :initarg :min :initform nil :type (or null octet-vector)) 182 (null-count :documentation "count of null value in the column 184 :initarg :null-count :initform nil :type 185 (or null (signed-byte 64))) 186 (distinct-count :documentation "count of distinct values occurring 188 :initarg :distinct-count :initform nil :type 189 (or null (signed-byte 64))) 190 (max-value :documentation 191 "Lower and upper bound values for the column, determined by its ColumnOrder. 193 These may be the actual minimum and maximum values found on a page or column 194 chunk, but can also be (more compact) values that do not exist on a page or 195 column chunk. For example, instead of storing \\Blart Versenwald III\\, a writer 196 may set min_value=\\B\\, max_value=\\C\\. Such more compact values must still be 197 valid values within the column's logical type. 199 Values are encoded using PLAIN encoding, except that variable-length byte 200 arrays do not include a length prefix. 
202 :initarg :max-value :initform nil :type (or null octet-vector)) 203 (min-value :initarg :min-value :initform nil :type 204 (or null octet-vector)) 205 (is-max-value-exact :documentation 206 "If true, max_value is the actual maximum value for a column 208 :initarg :is-max-value-exact :initform nil :type (or null boolean)) 209 (is-min-value-exact :documentation 210 "If true, min_value is the actual minimum value for a column 212 :initarg :is-min-value-exact :initform nil :type 214 (:documentation "Statistics per row group and per page 215 All fields are optional. 217 (defclass parquet-string-type (dat/parquet:parquet-object) nil 218 (:documentation "Empty structs to use as logical type annotations 220 (defclass parquet-uuid-type (dat/parquet:parquet-object) nil) 221 (defclass parquet-map-type (dat/parquet:parquet-object) nil) 222 (defclass parquet-list-type (dat/parquet:parquet-object) nil) 223 (defclass parquet-enum-type (dat/parquet:parquet-object) nil) 224 (defclass parquet-date-type (dat/parquet:parquet-object) nil) 225 (defclass parquet-float16-type (dat/parquet:parquet-object) nil) 226 (defclass parquet-null-type (dat/parquet:parquet-object) nil 228 "Logical type to annotate a column that is always null. 230 Sometimes when discovering the schema of existing data, values are always 231 null and the physical type can't be determined. This annotation signals 232 the case where the physical type was guessed from all null values. 234 (defclass parquet-decimal-type (dat/parquet:parquet-object) 235 ((scale :initarg :scale :type (signed-byte 32)) 236 (precision :initarg :precision :type (signed-byte 32))) 237 (:documentation "Decimal logical type annotation 239 Scale must be zero or a positive integer less than or equal to the precision. 240 Precision must be a non-zero positive integer. 242 To maintain forward-compatibility in v1, implementations using this logical 243 type must also set scale and precision on the annotated SchemaElement. 
245 Allowed for physical types: INT32, INT64, FIXED_LEN_BYTE_ARRAY, and BYTE_ARRAY. 247 (defclass parquet-milli-seconds (dat/parquet:parquet-object) nil 248 (:documentation "Time units for logical types 250 (defclass parquet-micro-seconds (dat/parquet:parquet-object) nil) 251 (defclass parquet-nano-seconds (dat/parquet:parquet-object) nil) 252 (defclass parquet-time-unit (dat/parquet:parquet-object) 253 ((millis :initarg :millis :initform nil :type 254 (or null parquet-milli-seconds)) 255 (micros :initarg :micros :initform nil :type 256 (or null parquet-micro-seconds)) 257 (nanos :initarg :nanos :initform nil :type 258 (or null parquet-nano-seconds)))) 259 (defclass parquet-timestamp-type (dat/parquet:parquet-object) 260 ((isadjustedtoutc :initarg :isadjustedtoutc :type boolean) 261 (unit :initarg :unit :type parquet-time-unit)) 262 (:documentation "Timestamp logical type annotation 264 Allowed for physical types: INT64 266 (defclass parquet-time-type (dat/parquet:parquet-object) 267 ((isadjustedtoutc :initarg :isadjustedtoutc :type boolean) 268 (unit :initarg :unit :type parquet-time-unit)) 269 (:documentation "Time logical type annotation 271 Allowed for physical types: INT32 (millis), INT64 (micros, nanos) 273 (defclass parquet-int-type (dat/parquet:parquet-object) 274 ((bitwidth :initarg :bitwidth) 275 (issigned :initarg :issigned :type boolean)) 276 (:documentation "Integer logical type annotation 278 bitWidth must be 8, 16, 32, or 64. 
280 Allowed for physical types: INT32, INT64 282 (defclass parquet-json-type (dat/parquet:parquet-object) nil 283 (:documentation "Embedded JSON logical type annotation 285 Allowed for physical types: BYTE_ARRAY 287 (defclass parquet-bson-type (dat/parquet:parquet-object) nil 288 (:documentation "Embedded BSON logical type annotation 290 Allowed for physical types: BYTE_ARRAY 292 (defclass parquet-logical-type (dat/parquet:parquet-object) 293 ((string :initarg :string :initform nil :type 294 (or null parquet-string-type)) 295 (map :initarg :map :initform nil :type (or null parquet-map-type)) 296 (list :initarg :list :initform nil :type 297 (or null parquet-list-type)) 298 (enum :initarg :enum :initform nil :type 299 (or null parquet-enum-type)) 300 (decimal :initarg :decimal :initform nil :type 301 (or null parquet-decimal-type)) 302 (date :initarg :date :initform nil :type 303 (or null parquet-date-type)) 310 (or null parquet-time-type)) 311 (timestamp :initarg :timestamp :initform nil :type 312 (or null parquet-timestamp-type)) 313 (integer :initarg :integer :initform nil :type 314 (or null parquet-int-type)) 315 (unknown :initarg :unknown :initform nil :type 316 (or null parquet-null-type)) 317 (json :initarg :json :initform nil :type 318 (or null parquet-json-type)) 319 (bson :initarg :bson :initform nil :type 320 (or null parquet-bson-type)) 321 (uuid :initarg :uuid :initform nil :type 322 (or null parquet-uuid-type)) 323 (float16 :initarg :float16 :initform nil :type 324 (or null parquet-float16-type))) 325 (:documentation "LogicalType annotations to replace ConvertedType. 327 To maintain compatibility, implementations using LogicalType for a 328 SchemaElement must also set the corresponding ConvertedType (if any) 329 from the following table. 331 (defclass parquet-schema-element (dat/parquet:parquet-object) 332 ((type :documentation 333 "Data type for this field. 
Not set if the current element is a non-leaf node 335 :initarg :type :initform nil :type (or null parquet-type)) 336 (type-length :documentation 337 "If type is FIXED_LEN_BYTE_ARRAY, this is the byte length of the values. 338 Otherwise, if specified, this is the maximum bit length to store any of the values. 339 (e.g. a low cardinality INT col could have this set to 3). Note that this is 340 in the schema, and therefore fixed for the entire file. 342 :initarg :type-length :initform nil :type 343 (or null (signed-byte 32))) 344 (repetition-type :documentation 345 "repetition of the field. The root of the schema does not have a repetition_type. 346 All other nodes must have one 348 :initarg :repetition-type :initform nil :type 349 (or null parquet-field-repetition-type)) 350 (name :documentation "Name of the field in the schema 352 :initarg :name :type string) 353 (num-children :documentation 354 "Nested fields. Since thrift does not support nested fields, 355 the nesting is flattened to a single list by a depth-first traversal. 356 The children count is used to construct the nested relationship. 357 This field is not set when the element is a primitive type 359 :initarg :num-children :initform nil :type 360 (or null (signed-byte 32))) 361 (converted-type :documentation 362 "DEPRECATED: When the schema is the result of a conversion from another model. 363 Used to record the original type to help with cross conversion. 365 This is superseded by logicalType. 367 :initarg :converted-type :initform nil :type 368 (or null parquet-converted-type)) 369 (scale :documentation 370 "DEPRECATED: Used when this column contains decimal data. 371 See the DECIMAL converted type for more details. 373 This is superseded by using the DecimalType annotation in logicalType. 
375 :initarg :scale :initform nil :type (or null (signed-byte 32))) 376 (precision :initarg :precision :initform nil :type 377 (or null (signed-byte 32))) 378 (field-id :documentation 379 "When the original schema supports field ids, this will save the 380 original field id in the parquet schema 382 :initarg :field-id :initform nil :type (or null (signed-byte 32))) 383 (logicaltype :documentation "The logical type of this SchemaElement 385 LogicalType replaces ConvertedType, but ConvertedType is still required 386 for some logical types to ensure forward-compatibility in format v1. 388 :initarg :logicaltype :initform nil :type 389 (or null parquet-logical-type))) 390 (:documentation "Represents an element inside a schema definition. 391 - if it is a group (inner node) then type is undefined and num_children is defined 392 - if it is a primitive type (leaf) then type is defined and num_children is undefined 393 the nodes are listed in depth first traversal order. 395 (defclass parquet-data-page-header (dat/parquet:parquet-object) 396 ((num-values :documentation 397 "Number of values, including NULLs, in this data page. 399 If an OffsetIndex is present, a page must begin at a row 400 boundary (repetition_level = 0). Otherwise, pages may begin 401 within a row (repetition_level > 0). 
404 :initarg :num-values :type (signed-byte 32)) 405 (encoding :documentation "Encoding used for this data page * 407 :initarg :encoding :type parquet-encoding) 408 (definition-level-encoding :documentation 409 "Encoding used for definition levels * 411 :initarg :definition-level-encoding :type parquet-encoding) 412 (repetition-level-encoding :documentation 413 "Encoding used for repetition levels * 415 :initarg :repetition-level-encoding :type parquet-encoding) 416 (statistics :documentation 417 "Optional statistics for the data in this page * 419 :initarg :statistics :initform nil :type 420 (or null parquet-statistics))) 421 (:documentation "Data page header 423 (defclass parquet-index-page-header (dat/parquet:parquet-object) nil) 424 (defclass parquet-dictionary-page-header (dat/parquet:parquet-object) 425 ((num-values :documentation "Number of values in the dictionary * 427 :initarg :num-values :type (signed-byte 32)) 428 (encoding :documentation "Encoding using this dictionary page * 430 :initarg :encoding :type parquet-encoding) 431 (is-sorted :documentation 432 "If true, the entries in the dictionary are sorted in ascending order * 434 :initarg :is-sorted :initform nil :type (or null boolean))) 436 "The dictionary page must be placed at the first position of the column chunk 437 if it is partly or completely dictionary encoded. At most one dictionary page 438 can be placed in a column chunk. 441 (defclass parquet-data-page-header-v2 (dat/parquet:parquet-object) 442 ((num-values :documentation 443 "Number of values, including NULLs, in this data page. * 445 :initarg :num-values :type (signed-byte 32)) 446 (num-nulls :documentation "Number of NULL values, in this data page. 447 Number of non-null = num_values - num_nulls which is also the number of values in the data section * 449 :initarg :num-nulls :type (signed-byte 32)) 450 (num-rows :documentation 451 "Number of rows in this data page. 
Every page must begin at a 452 row boundary (repetition_level = 0): rows must **not** be 453 split across page boundaries when using V2 data pages. 456 :initarg :num-rows :type (signed-byte 32)) 457 (encoding :documentation "Encoding used for data in this page * 459 :initarg :encoding :type parquet-encoding) 460 (definition-levels-byte-length :documentation 461 "Length of the definition levels 463 :initarg :definition-levels-byte-length :type (signed-byte 32)) 464 (repetition-levels-byte-length :documentation 465 "Length of the repetition levels 467 :initarg :repetition-levels-byte-length :type (signed-byte 32)) 468 (is-compressed :documentation "Whether the values are compressed. 469 Which means the section of the page between 470 definition_levels_byte_length + repetition_levels_byte_length + 1 and compressed_page_size (included) 471 is compressed with the compression_codec. 472 If missing it is considered compressed 474 :initarg :is-compressed :initform nil :type (or null boolean)) 475 (statistics :documentation 476 "Optional statistics for the data in this page * 478 :initarg :statistics :initform nil :type 479 (or null parquet-statistics))) 481 "New page format allowing reading levels without decompressing the data 482 Repetition and definition levels are uncompressed 483 The remaining section containing the data is compressed if is_compressed is true 486 (defclass parquet-split-block-algorithm (dat/parquet:parquet-object) nil 487 (:documentation "Block-based algorithm type annotation. * 489 (defclass parquet-bloom-filter-algorithm (dat/parquet:parquet-object) 490 ((block :documentation 491 "Block-based Bloom filter. * 498 (or null parquet-split-block-algorithm))) 499 (:documentation "The algorithm used in Bloom filter. * 501 (defclass parquet-xx-hash (dat/parquet:parquet-object) nil 503 "Hash strategy type annotation. xxHash is an extremely fast non-cryptographic hash 504 algorithm. It uses 64 bits version of xxHash. 
507 (defclass parquet-bloom-filter-hash (dat/parquet:parquet-object) 508 ((xxhash :documentation "xxHash Strategy. * 510 :initarg :xxhash :initform nil :type (or null parquet-xx-hash))) 512 "The hash function used in Bloom filter. This function takes the hash of a column value 513 using plain encoding. 516 (defclass parquet-uncompressed (dat/parquet:parquet-object) nil 517 (:documentation "The compression used in the Bloom filter. 520 (defclass parquet-bloom-filter-compression (dat/parquet:parquet-object) 521 ((uncompressed :initarg :uncompressed :initform nil :type 522 (or null parquet-uncompressed)))) 523 (defclass parquet-bloom-filter-header (dat/parquet:parquet-object) 524 ((numbytes :documentation "The size of bitset in bytes * 526 :initarg :numbytes :type (signed-byte 32)) 527 (algorithm :documentation "The algorithm for setting bits. * 529 :initarg :algorithm :type parquet-bloom-filter-algorithm) 530 (hash :documentation "The hash function used for Bloom filter. * 532 :initarg :hash :type parquet-bloom-filter-hash) 533 (compression :documentation 534 "The compression used in the Bloom filter * 536 :initarg :compression :type parquet-bloom-filter-compression)) 538 "Bloom filter header is stored at beginning of Bloom filter data of each column 539 and followed by its bitset. 
542 (defclass parquet-page-header (dat/parquet:parquet-object) 543 ((type :documentation 544 "the type of the page: indicates which of the *_header fields is set * 546 :initarg :type :type parquet-page-type) 547 (uncompressed-page-size :documentation 548 "Uncompressed page size in bytes (not including this header) * 550 :initarg :uncompressed-page-size :type (signed-byte 32)) 551 (compressed-page-size :documentation 552 "Compressed (and potentially encrypted) page size in bytes, not including this header * 554 :initarg :compressed-page-size :type (signed-byte 32)) 556 "The 32-bit CRC checksum for the page, to be calculated as follows: 558 - The standard CRC32 algorithm is used (with polynomial 0x04C11DB7, 559 the same as in e.g. GZip). 560 - All page types can have a CRC (v1 and v2 data pages, dictionary pages, 562 - The CRC is computed on the serialization binary representation of the page 563 (as written to disk), excluding the page header. For example, for v1 564 data pages, the CRC is computed on the concatenation of repetition levels, 565 definition levels and column values (optionally compressed, optionally 567 - The CRC computation therefore takes place after any compression 568 and encryption steps, if any. 570 If enabled, this allows for disabling checksumming in HDFS if only a few 571 pages need to be read. 
573 :initarg :crc :initform nil :type (or null (signed-byte 32))) 574 (data-page-header :initarg :data-page-header :initform nil :type 575 (or null parquet-data-page-header)) 576 (index-page-header :initarg :index-page-header :initform nil :type 577 (or null parquet-index-page-header)) 578 (dictionary-page-header :initarg :dictionary-page-header :initform 579 nil :type (or null parquet-dictionary-page-header)) 580 (data-page-header-v2 :initarg :data-page-header-v2 :initform nil 581 :type (or null parquet-data-page-header-v2)))) 582 (defclass parquet-key-value (dat/parquet:parquet-object) 583 ((key :initarg :key :type string) 584 (value :initarg :value :initform nil :type (or null string))) 585 (:documentation "Wrapper struct to store key values 587 (defclass parquet-sorting-column (dat/parquet:parquet-object) 588 ((column-idx :documentation 589 "The ordinal position of the column (in this row group) * 591 :initarg :column-idx :type (signed-byte 32)) 592 (descending :documentation 593 "If true, indicates this column is sorted in descending order. * 595 :initarg :descending :type boolean) 596 (nulls-first :documentation 597 "If true, nulls will come before non-null values, otherwise, 600 :initarg :nulls-first :type boolean)) 601 (:documentation "Sort order within a RowGroup of a leaf column 603 (defclass parquet-page-encoding-stats (dat/parquet:parquet-object) 604 ((page-type :documentation "the page type (data\\dic\\...) 
* 606 :initarg :page-type :type parquet-page-type) 607 (encoding :documentation "encoding of the page * 609 :initarg :encoding :type parquet-encoding) 610 (count :documentation 611 "number of pages of this type with this encoding * 613 :initarg :count :type (signed-byte 32))) 614 (:documentation "statistics of a given page type and encoding 616 (defclass parquet-column-meta-data (dat/parquet:parquet-object) 617 ((type :documentation "Type of this column * 619 :initarg :type :type parquet-type) 620 (encodings :documentation 621 "Set of all encodings used for this column. The purpose is to validate 622 whether we can decode those pages. * 624 :initarg :encodings :type (vector parquet-encoding)) 625 (path-in-schema :documentation "Path in schema * 627 :initarg :path-in-schema :type (vector string)) 628 (codec :documentation "Compression codec * 630 :initarg :codec :type parquet-compression-codec) 631 (num-values :documentation "Number of values in this column * 633 :initarg :num-values :type (signed-byte 64)) 634 (total-uncompressed-size :documentation 635 "total byte size of all uncompressed pages in this column chunk (including the headers) * 637 :initarg :total-uncompressed-size :type (signed-byte 64)) 638 (total-compressed-size :documentation 639 "total byte size of all compressed, and potentially encrypted, pages 640 in this column chunk (including the headers) * 642 :initarg :total-compressed-size :type (signed-byte 64)) 643 (key-value-metadata :documentation "Optional key\\value metadata * 645 :initarg :key-value-metadata :initform nil :type 646 (or null (vector parquet-key-value))) 647 (data-page-offset :documentation 648 "Byte offset from beginning of file to first data page * 650 :initarg :data-page-offset :type (signed-byte 64)) 651 (index-page-offset :documentation 652 "Byte offset from beginning of file to root index page * 654 :initarg :index-page-offset :initform nil :type 655 (or null (signed-byte 64))) 656 (dictionary-page-offset :documentation 657 
"Byte offset from the beginning of file to first (only) dictionary page * 659 :initarg :dictionary-page-offset :initform nil :type 660 (or null (signed-byte 64))) 661 (statistics :documentation "optional statistics for this column chunk 663 :initarg :statistics :initform nil :type 664 (or null parquet-statistics)) 665 (encoding-stats :documentation 666 "Set of all encodings used for pages in this column chunk. 667 This information can be used to determine if all data pages are 668 dictionary encoded for example * 670 :initarg :encoding-stats :initform nil :type 671 (or null (vector parquet-page-encoding-stats))) 672 (bloom-filter-offset :documentation 673 "Byte offset from beginning of file to Bloom filter data. * 675 :initarg :bloom-filter-offset :initform nil :type 676 (or null (signed-byte 64))) 677 (bloom-filter-length :documentation 678 "Size of Bloom filter data including the serialized header, in bytes. 679 Added in 2.10 so readers may not read this field from old files and 680 it can be obtained after the BloomFilterHeader has been deserialized. 681 Writers should write this field so readers can read the bloom filter 684 :initarg :bloom-filter-length :initform nil :type 685 (or null (signed-byte 32))) 686 (size-statistics :documentation 687 "Optional statistics to help estimate total memory when converted to in-memory 688 representations. 
The histograms contained in these statistics can 689 also be useful in some cases for more fine-grained nullability\\list length 692 :initarg :size-statistics :initform nil :type 693 (or null parquet-size-statistics))) 694 (:documentation "Description for column metadata 696 (defclass parquet-encryption-with-footer-key (dat/parquet:parquet-object) nil) 697 (defclass parquet-encryption-with-column-key (dat/parquet:parquet-object) 698 ((path-in-schema :documentation "Column path in schema * 700 :initarg :path-in-schema :type (vector string)) 701 (key-metadata :documentation 702 "Retrieval metadata of column encryption key * 704 :initarg :key-metadata :initform nil :type (or null octet-vector)))) 705 (defclass parquet-column-crypto-meta-data (dat/parquet:parquet-object) 706 ((encryption-with-footer-key :initarg :encryption-with-footer-key 707 :initform nil :type (or null parquet-encryption-with-footer-key)) 708 (encryption-with-column-key :initarg :encryption-with-column-key 709 :initform nil :type (or null parquet-encryption-with-column-key)))) 710 (defclass parquet-column-chunk (dat/parquet:parquet-object) 711 ((file-path :documentation 712 "File where column data is stored. If not set, assumed to be same file as 713 metadata. This path is relative to the current file. 716 :initarg :file-path :initform nil :type (or null string)) 717 (file-offset :documentation 718 "Deprecated: Byte offset in file_path to the ColumnMetaData 720 Past use of this field has been inconsistent, with some implementations 721 using it to point to the ColumnMetaData and some using it to point to 722 the first page in the column chunk. In many cases, the ColumnMetaData at this 723 location is wrong. This field is now deprecated and should not be used. 724 Writers should set this field to 0 if no ColumnMetaData has been written outside 727 :initarg :file-offset :type (signed-byte 64)) 728 (meta-data :documentation 729 "Column metadata for this chunk. 
Some writers may also replicate this at the 730 location pointed to by file_path/file_offset. 731 Note: while marked as optional, this field is in fact required by most major 732 Parquet implementations. As such, writers MUST populate this field. 735 :initarg :meta-data :initform nil :type 736 (or null parquet-column-meta-data)) 737 (offset-index-offset :documentation 738 "File offset of ColumnChunk's OffsetIndex * 740 :initarg :offset-index-offset :initform nil :type 741 (or null (signed-byte 64))) 742 (offset-index-length :documentation 743 "Size of ColumnChunk's OffsetIndex, in bytes * 745 :initarg :offset-index-length :initform nil :type 746 (or null (signed-byte 32))) 747 (column-index-offset :documentation 748 "File offset of ColumnChunk's ColumnIndex * 750 :initarg :column-index-offset :initform nil :type 751 (or null (signed-byte 64))) 752 (column-index-length :documentation 753 "Size of ColumnChunk's ColumnIndex, in bytes * 755 :initarg :column-index-length :initform nil :type 756 (or null (signed-byte 32))) 757 (crypto-metadata :documentation 758 "Crypto metadata of encrypted columns * 760 :initarg :crypto-metadata :initform nil :type 761 (or null parquet-column-crypto-meta-data)) 762 (encrypted-column-metadata :documentation 763 "Encrypted column metadata for this chunk * 765 :initarg :encrypted-column-metadata :initform nil :type 766 (or null octet-vector)))) 767 (defclass parquet-row-group (dat/parquet:parquet-object) 768 ((columns :documentation 769 "Metadata for each column chunk in this row group. 770 This list must have the same order as the SchemaElement list in FileMetaData. 
773 :initarg :columns :type (vector parquet-column-chunk)) 774 (total-byte-size :documentation 775 "Total byte size of all the uncompressed column data in this row group * 777 :initarg :total-byte-size :type (signed-byte 64)) 778 (num-rows :documentation "Number of rows in this row group * 780 :initarg :num-rows :type (signed-byte 64)) 781 (sorting-columns :documentation 782 "If set, specifies a sort ordering of the rows in this RowGroup. 783 The sorting columns can be a subset of all the columns. 785 :initarg :sorting-columns :initform nil :type 786 (or null (vector parquet-sorting-column))) 787 (file-offset :documentation 788 "Byte offset from beginning of file to first page (data or dictionary) 791 :initarg :file-offset :initform nil :type 792 (or null (signed-byte 64))) 793 (total-compressed-size :documentation 794 "Total byte size of all compressed (and potentially encrypted) column data 797 :initarg :total-compressed-size :initform nil :type 798 (or null (signed-byte 64))) 799 (ordinal :documentation "Row group ordinal in the file * 801 :initarg :ordinal :initform nil :type (or null (signed-byte 16))))) 802 (defclass parquet-type-defined-order (dat/parquet:parquet-object) nil 804 "Empty struct to signal the order defined by the physical or logical type 806 (defclass parquet-column-order (dat/parquet:parquet-object) 807 ((type-order :documentation "The sort orders for logical types are: 808 UTF8 - unsigned byte-wise comparison 809 INT8 - signed comparison 810 INT16 - signed comparison 811 INT32 - signed comparison 812 INT64 - signed comparison 813 UINT8 - unsigned comparison 814 UINT16 - unsigned comparison 815 UINT32 - unsigned comparison 816 UINT64 - unsigned comparison 817 DECIMAL - signed comparison of the represented value 818 DATE - signed comparison 819 TIME_MILLIS - signed comparison 820 TIME_MICROS - signed comparison 821 TIMESTAMP_MILLIS - signed comparison 822 TIMESTAMP_MICROS - signed comparison 824 JSON - unsigned byte-wise comparison 825 BSON - 
unsigned byte-wise comparison 826 ENUM - unsigned byte-wise comparison 830 In the absence of logical types, the sort order is determined by the physical type: 831 BOOLEAN - false, true 832 INT32 - signed comparison 833 INT64 - signed comparison 834 INT96 (only used for legacy timestamps) - undefined 835 FLOAT - signed comparison of the represented value (*) 836 DOUBLE - signed comparison of the represented value (*) 837 BYTE_ARRAY - unsigned byte-wise comparison 838 FIXED_LEN_BYTE_ARRAY - unsigned byte-wise comparison 840 (*) Because the sorting order is not specified properly for floating 841 point values (relations vs. total ordering) the following 842 compatibility rules should be applied when reading statistics: 843 - If the min is a NaN, it should be ignored. 844 - If the max is a NaN, it should be ignored. 845 - If the min is +0, the row group may contain -0 values as well. 846 - If the max is -0, the row group may contain +0 values as well. 847 - When looking for NaN values, min and max should be ignored. 849 When writing statistics the following rules should be followed: 850 - NaNs should not be written to min or max statistics fields. 851 - If the computed max value is zero (whether negative or positive), 852 `+0.0` should be written into the max statistics field. 853 - If the computed min value is zero (whether negative or positive), 854 `-0.0` should be written into the min statistics field. 856 :initarg :type-order :initform nil :type 857 (or null parquet-type-defined-order))) 859 "Union to specify the order used for the min_value and max_value fields for a 860 column. This union takes the role of an enhanced enum that allows rich 861 elements (which will be needed for a collation-based ordering in the future). 864 * TypeDefinedOrder - the column uses the order defined by its logical or 865 physical type (if there is no logical type). 867 If the reader does not support the value of this union, min and max stats 868 for this column should be ignored. 
870 (defclass parquet-page-location (dat/parquet:parquet-object) 871 ((offset :documentation "Offset of the page in the file * 873 :initarg :offset :type (signed-byte 64)) 874 (compressed-page-size :documentation 875 "Size of the page, including header. Sum of compressed_page_size and header 878 :initarg :compressed-page-size :type (signed-byte 32)) 879 (first-row-index :documentation 880 "Index within the RowGroup of the first row of the page. When an 881 OffsetIndex is present, pages must begin on row boundaries 882 (repetition_level = 0). 884 :initarg :first-row-index :type (signed-byte 64)))) 885 (defclass parquet-offset-index (dat/parquet:parquet-object) 886 ((page-locations :documentation 887 "PageLocations, ordered by increasing PageLocation.offset. It is required 888 that page_locations[i].first_row_index < page_locations[i+1].first_row_index. 890 :initarg :page-locations :type (vector parquet-page-location)) 891 (unencoded-byte-array-data-bytes :documentation 892 "Unencoded/uncompressed size for BYTE_ARRAY types. 894 See documentation for unencoded_byte_array_data_bytes in SizeStatistics for 895 more details on this field. 897 :initarg :unencoded-byte-array-data-bytes :initform nil :type 898 (or null (vector (signed-byte 64))))) 899 (:documentation "Optional offsets for each data page in a ColumnChunk. 901 Forms part of the page index, along with ColumnIndex. 903 OffsetIndex may be present even if ColumnIndex is not. 905 (defclass parquet-column-index (dat/parquet:parquet-object) 906 ((null-pages :documentation 907 "A list of Boolean values to determine the validity of the corresponding 908 min and max values. If true, a page contains only null values, and writers 909 have to set the corresponding entries in min_values and max_values to 910 byte[0], so that all lists have the same length. If false, the 911 corresponding entries in min_values and max_values must be valid. 
913 :initarg :null-pages :type (vector boolean)) 914 (min-values :documentation 915 "Two lists containing lower and upper bounds for the values of each page 916 determined by the ColumnOrder of the column. These may be the actual 917 minimum and maximum values found on a page, but can also be (more compact) 918 values that do not exist on a page. For example, instead of storing \"Blart 919 Versenwald III\", a writer may set min_values[i]=\"B\", max_values[i]=\"C\". 920 Such more compact values must still be valid values within the column's 921 logical type. Readers must make sure that list entries are populated before 922 using them by inspecting null_pages. 924 :initarg :min-values :type (vector octet-vector)) 925 (max-values :initarg :max-values :type (vector octet-vector)) 926 (boundary-order :documentation 927 "Stores whether both min_values and max_values are ordered and if so, in 928 which direction. This allows readers to perform binary searches in both 929 lists. Readers cannot assume that max_values[i] <= min_values[i+1], even 930 if the lists are ordered. 932 :initarg :boundary-order :type parquet-boundary-order) 933 (null-counts :documentation 934 "A list containing the number of null values for each page * 936 :initarg :null-counts :initform nil :type 937 (or null (vector (signed-byte 64)))) 938 (repetition-level-histograms :documentation 939 "Contains repetition level histograms for each page 940 concatenated together. The repetition_level_histogram field on 941 SizeStatistics contains more details. 943 When present the length should always be (number of pages * 944 (max_repetition_level + 1)) elements. 946 Element 0 is the first element of the histogram for the first page. 
947 Element (max_repetition_level + 1) is the first element of the histogram 951 :initarg :repetition-level-histograms :initform nil :type 952 (or null (vector (signed-byte 64)))) 953 (definition-level-histograms :documentation 954 "Same as repetition_level_histograms except for definition levels. 957 :initarg :definition-level-histograms :initform nil :type 958 (or null (vector (signed-byte 64))))) 960 "Optional statistics for each data page in a ColumnChunk. 962 Forms part of the page index, along with OffsetIndex. 964 If this structure is present, OffsetIndex must also be present. 966 For each field in this structure, <field>[i] refers to the page at 967 OffsetIndex.page_locations[i] 969 (defclass parquet-aes-gcm-v1 (dat/parquet:parquet-object) 970 ((aad-prefix :documentation "AAD prefix * 972 :initarg :aad-prefix :initform nil :type (or null octet-vector)) 973 (aad-file-unique :documentation 974 "Unique file identifier part of AAD suffix * 976 :initarg :aad-file-unique :initform nil :type 977 (or null octet-vector)) 978 (supply-aad-prefix :documentation 979 "In files encrypted with AAD prefix without storing it, 980 readers must supply the prefix * 982 :initarg :supply-aad-prefix :initform nil :type (or null boolean)))) 983 (defclass parquet-aes-gcm-ctr-v1 (dat/parquet:parquet-object) 984 ((aad-prefix :documentation "AAD prefix * 986 :initarg :aad-prefix :initform nil :type (or null octet-vector)) 987 (aad-file-unique :documentation 988 "Unique file identifier part of AAD suffix * 990 :initarg :aad-file-unique :initform nil :type 991 (or null octet-vector)) 992 (supply-aad-prefix :documentation 993 "In files encrypted with AAD prefix without storing it, 994 readers must supply the prefix * 996 :initarg :supply-aad-prefix :initform nil :type (or null boolean)))) 997 (defclass parquet-encryption-algorithm (dat/parquet:parquet-object) 998 ((aes-gcm-v1 :initarg :aes-gcm-v1 :initform nil :type 999 (or null parquet-aes-gcm-v1)) 1000 (aes-gcm-ctr-v1 :initarg 
:aes-gcm-ctr-v1 :initform nil :type 1001 (or null parquet-aes-gcm-ctr-v1)))) 1002 (defclass parquet-file-meta-data (dat/parquet:parquet-object) 1003 ((version :documentation "Version of this file * 1005 :initarg :version :type (signed-byte 32)) 1006 (schema :documentation 1007 "Parquet schema for this file. This schema contains metadata for all the columns. 1008 The schema is represented as a tree with a single root. The nodes of the tree 1009 are flattened to a list by doing a depth-first traversal. 1010 The column metadata contains the path in the schema for that column which can be 1011 used to map columns to nodes in the schema. 1012 The first element is the root * 1014 :initarg :schema :type (vector parquet-schema-element)) 1015 (num-rows :documentation "Number of rows in this file * 1017 :initarg :num-rows :type (signed-byte 64)) 1018 (row-groups :documentation "Row groups in this file * 1020 :initarg :row-groups :type (vector parquet-row-group)) 1021 (key-value-metadata :documentation "Optional key/value metadata * 1023 :initarg :key-value-metadata :initform nil :type 1024 (or null (vector parquet-key-value))) 1025 (created-by :documentation 1026 "String for application that wrote this file. This should be in the format 1027 <Application> version <App Version> (build <App Build Hash>). 1028 e.g. impala version 1.0 (build 6cf94d29b2b7115df4de2c06e2ab4326d721eb55) 1031 :initarg :created-by :initform nil :type (or null string)) 1032 (column-orders :documentation 1033 "Sort order used for the min_value and max_value fields in the Statistics 1034 objects and the min_values and max_values fields in the ColumnIndex 1035 objects of each column in this file. Sort orders are listed in the order 1036 matching the columns in the schema. 
The indexes are not necessarily the same 1037 though, because only leaf nodes of the schema are represented in the list 1040 Without column_orders, the meaning of the min_value and max_value fields 1041 in the Statistics object and the ColumnIndex object is undefined. To ensure 1042 well-defined behaviour, if these fields are written to a Parquet file, 1043 column_orders must be written as well. 1045 The obsolete min and max fields in the Statistics object are always sorted 1046 by signed comparison regardless of column_orders. 1048 :initarg :column-orders :initform nil :type 1049 (or null (vector parquet-column-order))) 1050 (encryption-algorithm :documentation 1051 "Encryption algorithm. This field is set only in encrypted files 1052 with plaintext footer. Files with encrypted footer store algorithm id 1053 in FileCryptoMetaData structure. 1055 :initarg :encryption-algorithm :initform nil :type 1056 (or null parquet-encryption-algorithm)) 1057 (footer-signing-key-metadata :documentation 1058 "Retrieval metadata of key used for signing the footer. 1059 Used only in encrypted files with plaintext footer. 1061 :initarg :footer-signing-key-metadata :initform nil :type 1062 (or null octet-vector))) 1063 (:documentation "Description for file metadata 1065 (defclass parquet-file-crypto-meta-data (dat/parquet:parquet-object) 1066 ((encryption-algorithm :documentation 1067 "Encryption algorithm. This field is only used for files 1068 with encrypted footer. Files with plaintext footer store algorithm id 1069 inside footer (FileMetaData structure). 1071 :initarg :encryption-algorithm :type parquet-encryption-algorithm) 1072 (key-metadata :documentation 1073 "Retrieval metadata of key used for encryption of footer, 1074 and (possibly) columns * 1076 :initarg :key-metadata :initform nil :type (or null octet-vector))) 1077 (:documentation "Crypto metadata for files with encrypted footer *