Mercurial > core / lisp/lib/dat/parquet/thrift.lisp
changeset 698: |
96958d3eb5b0 |
parent: |
642b3b82b20d
|
author: |
Richard Westhaver <ellis@rwest.io> |
date: |
Fri, 04 Oct 2024 22:04:59 -0400 |
permissions: |
-rw-r--r-- |
description: |
fixes |
1 ;;; /home/ellis/comp/core/lisp/lib/dat/parquet/thrift.lisp --- Parquet Thrift Definitions -*- buffer-read-only:t -*- 3 ;; input = /home/ellis/comp/core/.stash/parquet.json 5 ;; This file was generated automatically by 6 ;; DAT/PARQUET/GEN:PARSE-PARQUET-THRIFT-DEFINITIONS 11 (in-package :dat/parquet) 13 (defvar *parquet-json-types* 14 '(:boolean :int32 :int64 :int96 :float :double :byte-array 15 :fixed-len-byte-array)) 16 (defvar *parquet-json-converted-types* 17 '(:utf8 :map :map-key-value :list :enum :decimal :date :time-millis 18 :time-micros :timestamp-millis :timestamp-micros :uint-8 :uint-16 :uint-32 19 :uint-64 :int-8 :int-16 :int-32 :int-64 :json :bson :interval)) 20 (defvar *parquet-json-field-repetition-types* '(:required :optional :repeated)) 21 (defvar *parquet-json-encodings* 22 '(:plain :plain-dictionary :rle :bit-packed :delta-binary-packed 23 :delta-length-byte-array :delta-byte-array :rle-dictionary 25 (defvar *parquet-json-compression-codecs* 26 '(:uncompressed :snappy :gzip :lzo :brotli :lz4 :zstd :lz4-raw)) 27 (defvar *parquet-json-page-types* 28 '(:data-page :index-page :dictionary-page :data-page-v2)) 29 (defvar *parquet-json-boundary-orders* '(:unordered :ascending :descending)) 30 (deftype parquet-boolean () 'boolean) 31 (deftype parquet-int32 () '(signed-byte 32)) 32 (deftype parquet-int64 () '(signed-byte 64)) 33 (deftype parquet-int96 () '(signed-byte 96)) 34 (deftype parquet-float () 'float) 35 (deftype parquet-double () 'double-float) 36 (deftype parquet-byte-array (&optional dat/parquet/gen::size) 37 `(octet-vector ,dat/parquet/gen::size)) 38 (deftype parquet-fixed-len-byte-array (dat/parquet/gen::size) 39 `(octet-vector ,dat/parquet/gen::size)) 40 (defclass parquet-size-statistics (parquet-object) 41 ((unencoded-byte-array-data-bytes :documentation 42 "The number of physical bytes stored for BYTE_ARRAY data values assuming 43 no encoding. This is exclusive of the bytes needed to store the length of 44 each byte array. 
In other words, this field is equivalent to the `(size 45 of PLAIN-ENCODING the byte array values) - (4 bytes * number of values 46 written)`. To determine unencoded sizes of other types readers can use 47 schema information multiplied by the number of non-null and null values. 48 The number of null\\non-null values can be inferred from the histograms 51 For example, if a column chunk is dictionary-encoded with dictionary 52 [\\a\\, \\bc\\, \\cde\\], and a data page contains the indices [0, 0, 1, 2], 53 then this value for that data page should be 7 (1 + 1 + 2 + 3). 55 This field should only be set for types that use BYTE_ARRAY as their 58 :initarg :unencoded-byte-array-data-bytes :initform nil :type 59 (or null (signed-byte 64))) 60 (repetition-level-histogram :documentation 61 "When present, there is expected to be one element corresponding to each 62 repetition (i.e. size=max repetition_level+1) where each element 63 represents the number of times the repetition level was observed in the 66 This field may be omitted if max_repetition_level is 0 without loss 70 :initarg :repetition-level-histogram :initform nil :type 71 (or null (vector (signed-byte 64)))) 72 (definition-level-histogram :documentation 73 "Same as repetition_level_histogram except for definition levels. 75 This field may be omitted if max_definition_level is 0 or 1 without 79 :initarg :definition-level-histogram :initform nil :type 80 (or null (vector (signed-byte 64))))) 82 "A structure for capturing metadata for estimating the unencoded, 83 uncompressed size of data written. This is useful for readers to estimate 84 how much memory is needed to reconstruct data in their memory model and for 85 fine grained filter pushdown on nested structures (the histograms contained 86 in this structure can help determine the number of nulls at a particular 87 nesting level and maximum length of lists). 89 (defclass parquet-statistics (parquet-object) 91 "DEPRECATED: min and max value of the column. 
Use min_value and max_value. 93 Values are encoded using PLAIN encoding, except that variable-length byte 94 arrays do not include a length prefix. 96 These fields encode min and max values determined by signed comparison 97 only. New files should use the correct order for a column's logical type 98 and store the values in the min_value and max_value fields. 100 To support older readers, these may be set when the column order is 103 :initarg :max :initform nil :type (or null octet-vector)) 104 (min :initarg :min :initform nil :type (or null octet-vector)) 105 (null-count :documentation "count of null value in the column 107 :initarg :null-count :initform nil :type 108 (or null (signed-byte 64))) 109 (distinct-count :documentation "count of distinct values occurring 111 :initarg :distinct-count :initform nil :type 112 (or null (signed-byte 64))) 113 (max-value :documentation 114 "Lower and upper bound values for the column, determined by its ColumnOrder. 116 These may be the actual minimum and maximum values found on a page or column 117 chunk, but can also be (more compact) values that do not exist on a page or 118 column chunk. For example, instead of storing \\Blart Versenwald III\\, a writer 119 may set min_value=\\B\\, max_value=\\C\\. Such more compact values must still be 120 valid values within the column's logical type. 122 Values are encoded using PLAIN encoding, except that variable-length byte 123 arrays do not include a length prefix. 
125 :initarg :max-value :initform nil :type (or null octet-vector)) 126 (min-value :initarg :min-value :initform nil :type 127 (or null octet-vector)) 128 (is-max-value-exact :documentation 129 "If true, max_value is the actual maximum value for a column 131 :initarg :is-max-value-exact :initform nil :type (or null boolean)) 132 (is-min-value-exact :documentation 133 "If true, min_value is the actual minimum value for a column 135 :initarg :is-min-value-exact :initform nil :type 137 (:documentation "Statistics per row group and per page 138 All fields are optional. 140 (defclass parquet-string-type (parquet-object) nil 141 (:documentation "Empty structs to use as logical type annotations 143 (defclass parquet-uuid-type (parquet-object) nil) 144 (defclass parquet-map-type (parquet-object) nil) 145 (defclass parquet-list-type (parquet-object) nil) 146 (defclass parquet-enum-type (parquet-object) nil) 147 (defclass parquet-date-type (parquet-object) nil) 148 (defclass parquet-float16-type (parquet-object) nil) 149 (defclass parquet-null-type (parquet-object) nil 151 "Logical type to annotate a column that is always null. 153 Sometimes when discovering the schema of existing data, values are always 154 null and the physical type can't be determined. This annotation signals 155 the case where the physical type was guessed from all null values. 157 (defclass parquet-decimal-type (parquet-object) 158 ((scale :initarg :scale :type (signed-byte 32)) 159 (precision :initarg :precision :type (signed-byte 32))) 160 (:documentation "Decimal logical type annotation 162 Scale must be zero or a positive integer less than or equal to the precision. 163 Precision must be a non-zero positive integer. 165 To maintain forward-compatibility in v1, implementations using this logical 166 type must also set scale and precision on the annotated SchemaElement. 168 Allowed for physical types: INT32, INT64, FIXED_LEN_BYTE_ARRAY, and BYTE_ARRAY. 
170 (defclass parquet-milli-seconds (parquet-object) nil 171 (:documentation "Time units for logical types 173 (defclass parquet-micro-seconds (parquet-object) nil) 174 (defclass parquet-nano-seconds (parquet-object) nil) 175 (defclass parquet-time-unit (parquet-object) 176 ((millis :initarg :millis :initform nil :type 177 (or null parquet-milli-seconds)) 178 (micros :initarg :micros :initform nil :type 179 (or null parquet-micro-seconds)) 180 (nanos :initarg :nanos :initform nil :type 181 (or null parquet-nano-seconds)))) 182 (defclass parquet-timestamp-type (parquet-object) 183 ((isadjustedtoutc :initarg :isadjustedtoutc :type boolean) 184 (unit :initarg :unit :type parquet-time-unit)) 185 (:documentation "Timestamp logical type annotation 187 Allowed for physical types: INT64 189 (defclass parquet-time-type (parquet-object) 190 ((isadjustedtoutc :initarg :isadjustedtoutc :type boolean) 191 (unit :initarg :unit :type parquet-time-unit)) 192 (:documentation "Time logical type annotation 194 Allowed for physical types: INT32 (millis), INT64 (micros, nanos) 196 (defclass parquet-int-type (parquet-object) 197 ((bitwidth :initarg :bitwidth) 198 (issigned :initarg :issigned :type boolean)) 199 (:documentation "Integer logical type annotation 201 bitWidth must be 8, 16, 32, or 64. 
203 Allowed for physical types: INT32, INT64 205 (defclass parquet-json-type (parquet-object) nil 206 (:documentation "Embedded JSON logical type annotation 208 Allowed for physical types: BYTE_ARRAY 210 (defclass parquet-bson-type (parquet-object) nil 211 (:documentation "Embedded BSON logical type annotation 213 Allowed for physical types: BYTE_ARRAY 215 (defclass parquet-logical-type (parquet-object) 216 ((string :initarg :string :initform nil :type 217 (or null parquet-string-type)) 218 (map :initarg :map :initform nil :type (or null parquet-map-type)) 219 (list :initarg :list :initform nil :type 220 (or null parquet-list-type)) 221 (enum :initarg :enum :initform nil :type 222 (or null parquet-enum-type)) 223 (decimal :initarg :decimal :initform nil :type 224 (or null parquet-decimal-type)) 225 (date :initarg :date :initform nil :type 226 (or null parquet-date-type)) 233 (or null parquet-time-type)) 234 (timestamp :initarg :timestamp :initform nil :type 235 (or null parquet-timestamp-type)) 236 (integer :initarg :integer :initform nil :type 237 (or null parquet-int-type)) 238 (unknown :initarg :unknown :initform nil :type 239 (or null parquet-null-type)) 240 (json :initarg :json :initform nil :type 241 (or null parquet-json-type)) 242 (bson :initarg :bson :initform nil :type 243 (or null parquet-bson-type)) 244 (uuid :initarg :uuid :initform nil :type 245 (or null parquet-uuid-type)) 246 (float16 :initarg :float16 :initform nil :type 247 (or null parquet-float16-type))) 248 (:documentation "LogicalType annotations to replace ConvertedType. 250 To maintain compatibility, implementations using LogicalType for a 251 SchemaElement must also set the corresponding ConvertedType (if any) 252 from the following table. 254 (defclass parquet-schema-element (parquet-object) 255 ((type :documentation 256 "Data type for this field. 
Not set if the current element is a non-leaf node 258 :initarg :type :initform nil :type (or null parquet-type)) 259 (type-length :documentation 260 "If type is FIXED_LEN_BYTE_ARRAY, this is the byte length of the values. 261 Otherwise, if specified, this is the maximum bit length to store any of the values. 262 (e.g. a low cardinality INT col could have this set to 3). Note that this is 263 in the schema, and therefore fixed for the entire file. 265 :initarg :type-length :initform nil :type 266 (or null (signed-byte 32))) 267 (repetition-type :documentation 268 "repetition of the field. The root of the schema does not have a repetition_type. 269 All other nodes must have one 271 :initarg :repetition-type :initform nil :type 272 (or null parquet-field-repetition-type)) 273 (name :documentation "Name of the field in the schema 275 :initarg :name :type string) 276 (num-children :documentation 277 "Nested fields. Since thrift does not support nested fields, 278 the nesting is flattened to a single list by a depth-first traversal. 279 The children count is used to construct the nested relationship. 280 This field is not set when the element is a primitive type 282 :initarg :num-children :initform nil :type 283 (or null (signed-byte 32))) 284 (converted-type :documentation 285 "DEPRECATED: When the schema is the result of a conversion from another model. 286 Used to record the original type to help with cross conversion. 288 This is superseded by logicalType. 290 :initarg :converted-type :initform nil :type 291 (or null parquet-converted-type)) 292 (scale :documentation 293 "DEPRECATED: Used when this column contains decimal data. 294 See the DECIMAL converted type for more details. 296 This is superseded by using the DecimalType annotation in logicalType. 
298 :initarg :scale :initform nil :type (or null (signed-byte 32))) 299 (precision :initarg :precision :initform nil :type 300 (or null (signed-byte 32))) 301 (field-id :documentation 302 "When the original schema supports field ids, this will save the 303 original field id in the parquet schema 305 :initarg :field-id :initform nil :type (or null (signed-byte 32))) 306 (logicaltype :documentation "The logical type of this SchemaElement 308 LogicalType replaces ConvertedType, but ConvertedType is still required 309 for some logical types to ensure forward-compatibility in format v1. 311 :initarg :logicaltype :initform nil :type 312 (or null parquet-logical-type))) 313 (:documentation "Represents a element inside a schema definition. 314 - if it is a group (inner node) then type is undefined and num_children is defined 315 - if it is a primitive type (leaf) then type is defined and num_children is undefined 316 the nodes are listed in depth first traversal order. 318 (defclass parquet-data-page-header (parquet-object) 319 ((num-values :documentation 320 "Number of values, including NULLs, in this data page. 322 If a OffsetIndex is present, a page must begin at a row 323 boundary (repetition_level = 0). Otherwise, pages may begin 324 within a row (repetition_level > 0). 
327 :initarg :num-values :type (signed-byte 32)) 328 (encoding :documentation "Encoding used for this data page * 330 :initarg :encoding :type parquet-encoding) 331 (definition-level-encoding :documentation 332 "Encoding used for definition levels * 334 :initarg :definition-level-encoding :type parquet-encoding) 335 (repetition-level-encoding :documentation 336 "Encoding used for repetition levels * 338 :initarg :repetition-level-encoding :type parquet-encoding) 339 (statistics :documentation 340 "Optional statistics for the data in this page * 342 :initarg :statistics :initform nil :type 343 (or null parquet-statistics))) 344 (:documentation "Data page header 346 (defclass parquet-index-page-header (parquet-object) nil) 347 (defclass parquet-dictionary-page-header (parquet-object) 348 ((num-values :documentation "Number of values in the dictionary * 350 :initarg :num-values :type (signed-byte 32)) 351 (encoding :documentation "Encoding using this dictionary page * 353 :initarg :encoding :type parquet-encoding) 354 (is-sorted :documentation 355 "If true, the entries in the dictionary are sorted in ascending order * 357 :initarg :is-sorted :initform nil :type (or null boolean))) 359 "The dictionary page must be placed at the first position of the column chunk 360 if it is partly or completely dictionary encoded. At most one dictionary page 361 can be placed in a column chunk. 364 (defclass parquet-data-page-header-v2 (parquet-object) 365 ((num-values :documentation 366 "Number of values, including NULLs, in this data page. * 368 :initarg :num-values :type (signed-byte 32)) 369 (num-nulls :documentation "Number of NULL values, in this data page. 370 Number of non-null = num_values - num_nulls which is also the number of values in the data section * 372 :initarg :num-nulls :type (signed-byte 32)) 373 (num-rows :documentation 374 "Number of rows in this data page. 
Every page must begin at a 375 row boundary (repetition_level = 0): rows must **not** be 376 split across page boundaries when using V2 data pages. 379 :initarg :num-rows :type (signed-byte 32)) 380 (encoding :documentation "Encoding used for data in this page * 382 :initarg :encoding :type parquet-encoding) 383 (definition-levels-byte-length :documentation 384 "Length of the definition levels 386 :initarg :definition-levels-byte-length :type (signed-byte 32)) 387 (repetition-levels-byte-length :documentation 388 "Length of the repetition levels 390 :initarg :repetition-levels-byte-length :type (signed-byte 32)) 391 (is-compressed :documentation "Whether the values are compressed. 392 Which means the section of the page between 393 definition_levels_byte_length + repetition_levels_byte_length + 1 and compressed_page_size (included) 394 is compressed with the compression_codec. 395 If missing it is considered compressed 397 :initarg :is-compressed :initform nil :type (or null boolean)) 398 (statistics :documentation 399 "Optional statistics for the data in this page * 401 :initarg :statistics :initform nil :type 402 (or null parquet-statistics))) 404 "New page format allowing reading levels without decompressing the data 405 Repetition and definition levels are uncompressed 406 The remaining section containing the data is compressed if is_compressed is true 409 (defclass parquet-split-block-algorithm (parquet-object) nil 410 (:documentation "Block-based algorithm type annotation. * 412 (defclass parquet-bloom-filter-algorithm (parquet-object) 413 ((block :documentation 414 "Block-based Bloom filter. * 421 (or null parquet-split-block-algorithm))) 422 (:documentation "The algorithm used in Bloom filter. * 424 (defclass parquet-xx-hash (parquet-object) nil 426 "Hash strategy type annotation. xxHash is an extremely fast non-cryptographic hash 427 algorithm. It uses 64 bits version of xxHash. 
430 (defclass parquet-bloom-filter-hash (parquet-object) 431 ((xxhash :documentation "xxHash Strategy. * 433 :initarg :xxhash :initform nil :type (or null parquet-xx-hash))) 435 "The hash function used in Bloom filter. This function takes the hash of a column value 436 using plain encoding. 439 (defclass parquet-uncompressed (parquet-object) nil 440 (:documentation "The compression used in the Bloom filter. 443 (defclass parquet-bloom-filter-compression (parquet-object) 444 ((uncompressed :initarg :uncompressed :initform nil :type 445 (or null parquet-uncompressed)))) 446 (defclass parquet-bloom-filter-header (parquet-object) 447 ((numbytes :documentation "The size of bitset in bytes * 449 :initarg :numbytes :type (signed-byte 32)) 450 (algorithm :documentation "The algorithm for setting bits. * 452 :initarg :algorithm :type parquet-bloom-filter-algorithm) 453 (hash :documentation "The hash function used for Bloom filter. * 455 :initarg :hash :type parquet-bloom-filter-hash) 456 (compression :documentation 457 "The compression used in the Bloom filter * 459 :initarg :compression :type parquet-bloom-filter-compression)) 461 "Bloom filter header is stored at beginning of Bloom filter data of each column 462 and followed by its bitset. 465 (defclass parquet-page-header (parquet-object) 466 ((type :documentation 467 "the type of the page: indicates which of the *_header fields is set * 469 :initarg :type :type parquet-page-type) 470 (uncompressed-page-size :documentation 471 "Uncompressed page size in bytes (not including this header) * 473 :initarg :uncompressed-page-size :type (signed-byte 32)) 474 (compressed-page-size :documentation 475 "Compressed (and potentially encrypted) page size in bytes, not including this header * 477 :initarg :compressed-page-size :type (signed-byte 32)) 479 "The 32-bit CRC checksum for the page, to be be calculated as follows: 481 - The standard CRC32 algorithm is used (with polynomial 0x04C11DB7, 482 the same as in e.g. GZip). 
483 - All page types can have a CRC (v1 and v2 data pages, dictionary pages, 485 - The CRC is computed on the serialization binary representation of the page 486 (as written to disk), excluding the page header. For example, for v1 487 data pages, the CRC is computed on the concatenation of repetition levels, 488 definition levels and column values (optionally compressed, optionally 490 - The CRC computation therefore takes place after any compression 491 and encryption steps, if any. 493 If enabled, this allows for disabling checksumming in HDFS if only a few 494 pages need to be read. 496 :initarg :crc :initform nil :type (or null (signed-byte 32))) 497 (data-page-header :initarg :data-page-header :initform nil :type 498 (or null parquet-data-page-header)) 499 (index-page-header :initarg :index-page-header :initform nil :type 500 (or null parquet-index-page-header)) 501 (dictionary-page-header :initarg :dictionary-page-header :initform 502 nil :type (or null parquet-dictionary-page-header)) 503 (data-page-header-v2 :initarg :data-page-header-v2 :initform nil 504 :type (or null parquet-data-page-header-v2)))) 505 (defclass parquet-key-value (parquet-object) 506 ((key :initarg :key :type string) 507 (value :initarg :value :initform nil :type (or null string))) 508 (:documentation "Wrapper struct to store key values 510 (defclass parquet-sorting-column (parquet-object) 511 ((column-idx :documentation 512 "The ordinal position of the column (in this row group) * 514 :initarg :column-idx :type (signed-byte 32)) 515 (descending :documentation 516 "If true, indicates this column is sorted in descending order. 
* 518 :initarg :descending :type boolean) 519 (nulls-first :documentation 520 "If true, nulls will come before non-null values, otherwise, 523 :initarg :nulls-first :type boolean)) 524 (:documentation "Sort order within a RowGroup of a leaf column 526 (defclass parquet-page-encoding-stats (parquet-object) 527 ((page-type :documentation "the page type (data\\dic\\...) * 529 :initarg :page-type :type parquet-page-type) 530 (encoding :documentation "encoding of the page * 532 :initarg :encoding :type parquet-encoding) 533 (count :documentation 534 "number of pages of this type with this encoding * 536 :initarg :count :type (signed-byte 32))) 537 (:documentation "statistics of a given page type and encoding 539 (defclass parquet-column-meta-data (parquet-object) 540 ((type :documentation "Type of this column * 542 :initarg :type :type parquet-type) 543 (encodings :documentation 544 "Set of all encodings used for this column. The purpose is to validate 545 whether we can decode those pages. * 547 :initarg :encodings :type (vector parquet-encoding)) 548 (path-in-schema :documentation "Path in schema * 550 :initarg :path-in-schema :type (vector string)) 551 (codec :documentation "Compression codec * 553 :initarg :codec :type parquet-compression-codec) 554 (num-values :documentation "Number of values in this column * 556 :initarg :num-values :type (signed-byte 64)) 557 (total-uncompressed-size :documentation 558 "total byte size of all uncompressed pages in this column chunk (including the headers) * 560 :initarg :total-uncompressed-size :type (signed-byte 64)) 561 (total-compressed-size :documentation 562 "total byte size of all compressed, and potentially encrypted, pages 563 in this column chunk (including the headers) * 565 :initarg :total-compressed-size :type (signed-byte 64)) 566 (key-value-metadata :documentation "Optional key\\value metadata * 568 :initarg :key-value-metadata :initform nil :type 569 (or null (vector parquet-key-value))) 570 (data-page-offset 
:documentation 571 "Byte offset from beginning of file to first data page * 573 :initarg :data-page-offset :type (signed-byte 64)) 574 (index-page-offset :documentation 575 "Byte offset from beginning of file to root index page * 577 :initarg :index-page-offset :initform nil :type 578 (or null (signed-byte 64))) 579 (dictionary-page-offset :documentation 580 "Byte offset from the beginning of file to first (only) dictionary page * 582 :initarg :dictionary-page-offset :initform nil :type 583 (or null (signed-byte 64))) 584 (statistics :documentation "optional statistics for this column chunk 586 :initarg :statistics :initform nil :type 587 (or null parquet-statistics)) 588 (encoding-stats :documentation 589 "Set of all encodings used for pages in this column chunk. 590 This information can be used to determine if all data pages are 591 dictionary encoded for example * 593 :initarg :encoding-stats :initform nil :type 594 (or null (vector parquet-page-encoding-stats))) 595 (bloom-filter-offset :documentation 596 "Byte offset from beginning of file to Bloom filter data. * 598 :initarg :bloom-filter-offset :initform nil :type 599 (or null (signed-byte 64))) 600 (bloom-filter-length :documentation 601 "Size of Bloom filter data including the serialized header, in bytes. 602 Added in 2.10 so readers may not read this field from old files and 603 it can be obtained after the BloomFilterHeader has been deserialized. 604 Writers should write this field so readers can read the bloom filter 607 :initarg :bloom-filter-length :initform nil :type 608 (or null (signed-byte 32))) 609 (size-statistics :documentation 610 "Optional statistics to help estimate total memory when converted to in-memory 611 representations. 
The histograms contained in these statistics can 612 also be useful in some cases for more fine-grained nullability\\list length 615 :initarg :size-statistics :initform nil :type 616 (or null parquet-size-statistics))) 617 (:documentation "Description for column metadata 619 (defclass parquet-encryption-with-footer-key (parquet-object) nil) 620 (defclass parquet-encryption-with-column-key (parquet-object) 621 ((path-in-schema :documentation "Column path in schema * 623 :initarg :path-in-schema :type (vector string)) 624 (key-metadata :documentation 625 "Retrieval metadata of column encryption key * 627 :initarg :key-metadata :initform nil :type (or null octet-vector)))) 628 (defclass parquet-column-crypto-meta-data (parquet-object) 629 ((encryption-with-footer-key :initarg :encryption-with-footer-key 630 :initform nil :type (or null parquet-encryption-with-footer-key)) 631 (encryption-with-column-key :initarg :encryption-with-column-key 632 :initform nil :type (or null parquet-encryption-with-column-key)))) 633 (defclass parquet-column-chunk (parquet-object) 634 ((file-path :documentation 635 "File where column data is stored. If not set, assumed to be same file as 636 metadata. This path is relative to the current file. 639 :initarg :file-path :initform nil :type (or null string)) 640 (file-offset :documentation 641 "Deprecated: Byte offset in file_path to the ColumnMetaData 643 Past use of this field has been inconsistent, with some implementations 644 using it to point to the ColumnMetaData and some using it to point to 645 the first page in the column chunk. In many cases, the ColumnMetaData at this 646 location is wrong. This field is now deprecated and should not be used. 647 Writers should set this field to 0 if no ColumnMetaData has been written outside 650 :initarg :file-offset :type (signed-byte 64)) 651 (meta-data :documentation 652 "Column metadata for this chunk. 
Some writers may also replicate this at the 653 location pointed to by file_path\\file_offset. 654 Note: while marked as optional, this field is in fact required by most major 655 Parquet implementations. As such, writers MUST populate this field. 658 :initarg :meta-data :initform nil :type 659 (or null parquet-column-meta-data)) 660 (offset-index-offset :documentation 661 "File offset of ColumnChunk's OffsetIndex * 663 :initarg :offset-index-offset :initform nil :type 664 (or null (signed-byte 64))) 665 (offset-index-length :documentation 666 "Size of ColumnChunk's OffsetIndex, in bytes * 668 :initarg :offset-index-length :initform nil :type 669 (or null (signed-byte 32))) 670 (column-index-offset :documentation 671 "File offset of ColumnChunk's ColumnIndex * 673 :initarg :column-index-offset :initform nil :type 674 (or null (signed-byte 64))) 675 (column-index-length :documentation 676 "Size of ColumnChunk's ColumnIndex, in bytes * 678 :initarg :column-index-length :initform nil :type 679 (or null (signed-byte 32))) 680 (crypto-metadata :documentation 681 "Crypto metadata of encrypted columns * 683 :initarg :crypto-metadata :initform nil :type 684 (or null parquet-column-crypto-meta-data)) 685 (encrypted-column-metadata :documentation 686 "Encrypted column metadata for this chunk * 688 :initarg :encrypted-column-metadata :initform nil :type 689 (or null octet-vector)))) 690 (defclass parquet-row-group (parquet-object) 691 ((columns :documentation 692 "Metadata for each column chunk in this row group. 693 This list must have the same order as the SchemaElement list in FileMetaData. 
696 :initarg :columns :type (vector parquet-column-chunk)) 697 (total-byte-size :documentation 698 "Total byte size of all the uncompressed column data in this row group * 700 :initarg :total-byte-size :type (signed-byte 64)) 701 (num-rows :documentation "Number of rows in this row group * 703 :initarg :num-rows :type (signed-byte 64)) 704 (sorting-columns :documentation 705 "If set, specifies a sort ordering of the rows in this RowGroup. 706 The sorting columns can be a subset of all the columns. 708 :initarg :sorting-columns :initform nil :type 709 (or null (vector parquet-sorting-column))) 710 (file-offset :documentation 711 "Byte offset from beginning of file to first page (data or dictionary) 714 :initarg :file-offset :initform nil :type 715 (or null (signed-byte 64))) 716 (total-compressed-size :documentation 717 "Total byte size of all compressed (and potentially encrypted) column data 720 :initarg :total-compressed-size :initform nil :type 721 (or null (signed-byte 64))) 722 (ordinal :documentation "Row group ordinal in the file * 724 :initarg :ordinal :initform nil :type (or null (signed-byte 16))))) 725 (defclass parquet-type-defined-order (parquet-object) nil 727 "Empty struct to signal the order defined by the physical or logical type 729 (defclass parquet-column-order (parquet-object) 730 ((type-order :documentation "The sort orders for logical types are: 731 UTF8 - unsigned byte-wise comparison 732 INT8 - signed comparison 733 INT16 - signed comparison 734 INT32 - signed comparison 735 INT64 - signed comparison 736 UINT8 - unsigned comparison 737 UINT16 - unsigned comparison 738 UINT32 - unsigned comparison 739 UINT64 - unsigned comparison 740 DECIMAL - signed comparison of the represented value 741 DATE - signed comparison 742 TIME_MILLIS - signed comparison 743 TIME_MICROS - signed comparison 744 TIMESTAMP_MILLIS - signed comparison 745 TIMESTAMP_MICROS - signed comparison 747 JSON - unsigned byte-wise comparison 748 BSON - unsigned byte-wise 
comparison 749 ENUM - unsigned byte-wise comparison 753 In the absence of logical types, the sort order is determined by the physical type: 754 BOOLEAN - false, true 755 INT32 - signed comparison 756 INT64 - signed comparison 757 INT96 (only used for legacy timestamps) - undefined 758 FLOAT - signed comparison of the represented value (*) 759 DOUBLE - signed comparison of the represented value (*) 760 BYTE_ARRAY - unsigned byte-wise comparison 761 FIXED_LEN_BYTE_ARRAY - unsigned byte-wise comparison 763 (*) Because the sorting order is not specified properly for floating 764 point values (relations vs. total ordering) the following 765 compatibility rules should be applied when reading statistics: 766 - If the min is a NaN, it should be ignored. 767 - If the max is a NaN, it should be ignored. 768 - If the min is +0, the row group may contain -0 values as well. 769 - If the max is -0, the row group may contain +0 values as well. 770 - When looking for NaN values, min and max should be ignored. 772 When writing statistics the following rules should be followed: 773 - NaNs should not be written to min or max statistics fields. 774 - If the computed max value is zero (whether negative or positive), 775 `+0.0` should be written into the max statistics field. 776 - If the computed min value is zero (whether negative or positive), 777 `-0.0` should be written into the min statistics field. 779 :initarg :type-order :initform nil :type 780 (or null parquet-type-defined-order))) 782 "Union to specify the order used for the min_value and max_value fields for a 783 column. This union takes the role of an enhanced enum that allows rich 784 elements (which will be needed for a collation-based ordering in the future). 787 * TypeDefinedOrder - the column uses the order defined by its logical or 788 physical type (if there is no logical type). 790 If the reader does not support the value of this union, min and max stats 791 for this column should be ignored. 
;; Location of one data page inside a column chunk: file offset, size and
;; first row.  Part of the page index (see PARQUET-OFFSET-INDEX).
(defclass parquet-page-location (parquet-object)
  ((offset
    :initarg :offset
    :type (signed-byte 64)
    :documentation "Offset of the page in the file")
   (compressed-page-size
    :initarg :compressed-page-size
    :type (signed-byte 32)
    :documentation
    "Size of the page, including header. Sum of compressed_page_size and header
length")
   (first-row-index
    :initarg :first-row-index
    :type (signed-byte 64)
    :documentation
    "Index within the RowGroup of the first row of the page. When an
OffsetIndex is present, pages must begin on row boundaries
(repetition_level = 0).")))

(defclass parquet-offset-index (parquet-object)
  ((page-locations
    :initarg :page-locations
    :type (vector parquet-page-location)
    :documentation
    "PageLocations, ordered by increasing PageLocation.offset. It is required
that page_locations[i].first_row_index < page_locations[i+1].first_row_index.")
   (unencoded-byte-array-data-bytes
    :initarg :unencoded-byte-array-data-bytes
    :initform nil
    :type (or null (vector (signed-byte 64)))
    :documentation
    "Unencoded/uncompressed size for BYTE_ARRAY types.

See documentation for unencoded_byte_array_data_bytes in SizeStatistics for
more details on this field."))
  (:documentation
   "Optional offsets for each data page in a ColumnChunk.

Forms part of the page index, along with ColumnIndex.

OffsetIndex may be present even if ColumnIndex is not."))

(defclass parquet-column-index (parquet-object)
  ((null-pages
    :initarg :null-pages
    :type (vector boolean)
    :documentation
    "A list of Boolean values to determine the validity of the corresponding
min and max values. If true, a page contains only null values, and writers
have to set the corresponding entries in min_values and max_values to
byte[0], so that all lists have the same length. If false, the
corresponding entries in min_values and max_values must be valid.")
   (min-values
    :initarg :min-values
    :type (vector octet-vector)
    :documentation
    "Two lists containing lower and upper bounds for the values of each page
determined by the ColumnOrder of the column. These may be the actual
minimum and maximum values found on a page, but can also be (more compact)
values that do not exist on a page. For example, instead of storing \"Blart
Versenwald III\", a writer may set min_values[i]=\"B\", max_values[i]=\"C\".
Such more compact values must still be valid values within the column's
logical type. Readers must make sure that list entries are populated before
using them by inspecting null_pages.")
   (max-values
    :initarg :max-values
    :type (vector octet-vector)
    :documentation
    "Upper bounds for the values of each page; described together with the
lower bounds in the documentation of the MIN-VALUES slot.")
   (boundary-order
    :initarg :boundary-order
    :type parquet-boundary-order
    :documentation
    "Stores whether both min_values and max_values are ordered and if so, in
which direction. This allows readers to perform binary searches in both
lists. Readers cannot assume that max_values[i] <= min_values[i+1], even
if the lists are ordered.")
   (null-counts
    :initarg :null-counts
    :initform nil
    :type (or null (vector (signed-byte 64)))
    :documentation
    "A list containing the number of null values for each page")
   (repetition-level-histograms
    :initarg :repetition-level-histograms
    :initform nil
    :type (or null (vector (signed-byte 64)))
    :documentation
    "Contains repetition level histograms for each page
concatenated together. The repetition_level_histogram field on
SizeStatistics contains more details.

When present the length should always be (number of pages *
(max_repetition_level + 1)) elements.

Element 0 is the first element of the histogram for the first page.
Element (max_repetition_level + 1) is the first element of the histogram
for the second page.")
   (definition-level-histograms
    :initarg :definition-level-histograms
    :initform nil
    :type (or null (vector (signed-byte 64)))
    :documentation
    "Same as repetition_level_histograms except for definitions levels."))
  (:documentation
   "Optional statistics for each data page in a ColumnChunk.

Forms part of the page index, along with OffsetIndex.

If this structure is present, OffsetIndex must also be present.

For each field in this structure, <field>[i] refers to the page at
OffsetIndex.page_locations[i]"))

;; AAD (additional authenticated data) parameters for the AES-GCM-V1
;; encryption algorithm.  All slots are optional and default to NIL.
(defclass parquet-aes-gcm-v1 (parquet-object)
  ((aad-prefix
    :initarg :aad-prefix
    :initform nil
    :type (or null octet-vector)
    :documentation "AAD prefix")
   (aad-file-unique
    :initarg :aad-file-unique
    :initform nil
    :type (or null octet-vector)
    :documentation "Unique file identifier part of AAD suffix")
   (supply-aad-prefix
    :initarg :supply-aad-prefix
    :initform nil
    :type (or null boolean)
    :documentation
    "In files encrypted with AAD prefix without storing it,
readers must supply the prefix")))

;; Same slot layout as PARQUET-AES-GCM-V1, for the AES-GCM-CTR-V1 algorithm.
(defclass parquet-aes-gcm-ctr-v1 (parquet-object)
  ((aad-prefix
    :initarg :aad-prefix
    :initform nil
    :type (or null octet-vector)
    :documentation "AAD prefix")
   (aad-file-unique
    :initarg :aad-file-unique
    :initform nil
    :type (or null octet-vector)
    :documentation "Unique file identifier part of AAD suffix")
   (supply-aad-prefix
    :initarg :supply-aad-prefix
    :initform nil
    :type (or null boolean)
    :documentation
    "In files encrypted with AAD prefix without storing it,
readers must supply the prefix")))

;; Holder for one of the two supported algorithm descriptions.
;; NOTE(review): presumably a Thrift union, i.e. at most one slot is
;; non-NIL at a time -- nothing here enforces that; confirm against the
;; reader/writer code.
(defclass parquet-encryption-algorithm (parquet-object)
  ((aes-gcm-v1
    :initarg :aes-gcm-v1
    :initform nil
    :type (or null parquet-aes-gcm-v1))
   (aes-gcm-ctr-v1
    :initarg :aes-gcm-ctr-v1
    :initform nil
    :type (or null parquet-aes-gcm-ctr-v1))))

(defclass parquet-file-meta-data (parquet-object)
  ((version
    :initarg :version
    :type (signed-byte 32)
    :documentation "Version of this file")
   (schema
    :initarg :schema
    :type (vector parquet-schema-element)
    :documentation
    "Parquet schema for this file. This schema contains metadata for all the columns.
The schema is represented as a tree with a single root. The nodes of the tree
are flattened to a list by doing a depth-first traversal.
The column metadata contains the path in the schema for that column which can be
used to map columns to nodes in the schema.
The first element is the root")
   (num-rows
    :initarg :num-rows
    :type (signed-byte 64)
    :documentation "Number of rows in this file")
   (row-groups
    :initarg :row-groups
    :type (vector parquet-row-group)
    :documentation "Row groups in this file")
   (key-value-metadata
    :initarg :key-value-metadata
    :initform nil
    :type (or null (vector parquet-key-value))
    :documentation "Optional key/value metadata")
   (created-by
    :initarg :created-by
    :initform nil
    :type (or null string)
    :documentation
    "String for application that wrote this file. This should be in the format
<Application> version <App Version> (build <App Build Hash>).
e.g. impala version 1.0 (build 6cf94d29b2b7115df4de2c06e2ab4326d721eb55)")
   (column-orders
    :initarg :column-orders
    :initform nil
    :type (or null (vector parquet-column-order))
    :documentation
    "Sort order used for the min_value and max_value fields in the Statistics
objects and the min_values and max_values fields in the ColumnIndex
objects of each column in this file. Sort orders are listed in the order
matching the columns in the schema. The indexes are not necessarily the same
though, because only leaf nodes of the schema are represented in the list
of sort orders.

Without column_orders, the meaning of the min_value and max_value fields
in the Statistics object and the ColumnIndex object is undefined. To ensure
well-defined behaviour, if these fields are written to a Parquet file,
column_orders must be written as well.

The obsolete min and max fields in the Statistics object are always sorted
by signed comparison regardless of column_orders.")
   (encryption-algorithm
    :initarg :encryption-algorithm
    :initform nil
    :type (or null parquet-encryption-algorithm)
    :documentation
    "Encryption algorithm. This field is set only in encrypted files
with plaintext footer. Files with encrypted footer store algorithm id
in FileCryptoMetaData structure.")
   (footer-signing-key-metadata
    :initarg :footer-signing-key-metadata
    :initform nil
    :type (or null octet-vector)
    :documentation
    "Retrieval metadata of key used for signing the footer.
Used only in encrypted files with plaintext footer."))
  (:documentation "Description for file metadata"))

(defclass parquet-file-crypto-meta-data (parquet-object)
  ((encryption-algorithm
    :initarg :encryption-algorithm
    :type parquet-encryption-algorithm
    :documentation
    "Encryption algorithm. This field is only used for files
with encrypted footer. Files with plaintext footer store algorithm id
inside footer (FileMetaData structure).")
   (key-metadata
    :initarg :key-metadata
    :initform nil
    :type (or null octet-vector)
    :documentation
    "Retrieval metadata of key used for encryption of footer,
and (possibly) columns"))
  (:documentation "Crypto metadata for files with encrypted footer"))