Mercurial > core / lisp/lib/dat/parquet/tcompact.lisp
changeset 550: |
4d34907c69eb |
parent: |
32bd859533b3
|
author: |
Richard Westhaver <ellis@rwest.io> |
date: |
Tue, 16 Jul 2024 21:52:09 -0400 |
permissions: |
-rw-r--r-- |
description: |
more work on tcompact/thrift, fixed type info in parquet-struct-objects |
;;; tcompact.lisp --- Thrift Compact Protocol

;; ref: https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md

;; in order to encode Parquet, we need to be able to encode the Thrift Compact
;; Protocol (TCompact). All thrift structures we've generated via parquet.json
;; are serialized using TCompact.

;; see also: https://thrift.apache.org/static/files/thrift-20070401.pdf

;; NOTE(review): this listing was recovered from a rendered view in which
;; short source lines were not visible. Every spot that had to be
;; reconstructed is marked NOTE(review) below -- confirm against the repository.

(in-package :dat/parquet)

;;; Objects

(defclass thrift-object (id) ())

(defgeneric thrift-element-type (self)
  (:documentation "Return the Thrift type keyword SELF is encoded as.")
  (:method ((self parquet-struct-object)) :struct))

(defgeneric thrift-object-length (self)
  (:documentation "Return the number of elements contained in SELF."))

;;; Integers

;; ULEB128 example:
;;
;;   50399 = 11000100 11011111            (LSB)
;;         = 0000011 0001001 1011111      (7-bit groups)
;;         = 00000011 10001001 11011111   (add continuation bits)
;;         = 0x03 0x89 0xDF               (hex)
;;         → 0xDF 0x89 0x03               (write to ram LSB first)

;; encoded as ULEB128. signed and unsigned bytes are encoded as single
;; bytes. all others are converted to int64.

(defun zigzag (n)
  "Map the signed integer N (at most 64 bits wide) to a non-negative integer
so that values of small magnitude get small codes: 0 -> 0, -1 -> 1, 1 -> 2."
  (logxor (ash n 1) (ash n -63)))

;; NOTE(review): only the body of this function was visible; the name
;; UNZIGZAG is a reconstruction -- restore the original name if it differs.
(defun unzigzag (n)
  "Inverse of ZIGZAG: recover the signed integer from its zigzag code N."
  (logxor (ash n -1) (- (logand n 1))))

(defun tcompact-encode-integer (n &optional (size 8))
  "Encode the signed integer N and return a vector of octets.

SIZE is the declared width of the Thrift integer type; it is passed through
to ENCODE-ULEB128 (presumably a width in bytes -- confirm). A SIZE of 1
yields the single two's-complement octet used for i8; every other width is
zigzag-encoded and written as a ULEB128 varint.

The choice is made on SIZE and not on the magnitude of N: a decoder only
knows the declared type, so a small i32 must still be a zigzag varint."
  (if (= size 1)
      (make-array 1 :element-type '(unsigned-byte 8)
                    :initial-element (ldb (byte 8 0) n))
      (encode-uleb128 (zigzag n) size)))

;;; Enums

;; ordinal value encoded as int32
(defun tcompact-encode-enum (n)
  "Encode the enum ordinal N as an int32 (zigzag varint)."
  (tcompact-encode-integer n 4))

;;; Binary

;; Binary protocol, binary data, 1+ bytes:
;; +--------+...+--------+--------+...+--------+
;; | byte length         | bytes               |
;; +--------+...+--------+--------+...+--------+

;; a varint followed by the bytes
(defun tcompact-encode-octet-vector (octets)
  "Encode OCTETS as Thrift binary: an unsigned varint byte count, then the bytes.

The count is a plain ULEB128 value. It must not be zigzag-encoded, so
ENCODE-ULEB128 is called directly instead of TCOMPACT-ENCODE-INTEGER."
  (concatenate 'octet-vector
               (encode-uleb128 (length octets) 4)
               octets))

;;; Strings

;; encoded as UTF-8 bytes without null-termination
(defun tcompact-encode-string (string)
  "Encode STRING as Thrift binary holding its UTF-8 octets.

Strings are written exactly like binary data, so the octets are preceded by
their length; without it a reader cannot find the end of the string."
  (tcompact-encode-octet-vector
   (sb-ext:string-to-octets string :external-format :utf-8)))

;;; Doubles

(defun tcompact-encode-double (float)
  "Encode FLOAT as 8 octets: its IEEE-754 binary64 bit pattern, least
significant byte first. FLOAT is coerced to DOUBLE-FLOAT first."
  (let* ((d (coerce float 'double-float))
         ;; The high word is returned as a signed 32-bit integer; LDB strips
         ;; the sign extension before the two halves are joined.
         (bits (logior (ash (ldb (byte 32 0) (sb-kernel:double-float-high-bits d)) 32)
                       (sb-kernel:double-float-low-bits d)))
         (out (make-array 8 :element-type '(unsigned-byte 8))))
    (dotimes (i 8 out)
      (setf (aref out i) (ldb (byte 8 (* 8 i)) bits)))))

;;; Booleans

;; NOTE(review): the body of this function was not visible. The values below
;; follow *TCOMPACT-FIELD-TYPES* (:TRUE -> 1, :FALSE -> 2) -- confirm.
(defun tcompact-encode-boolean (bool)
  "Return 1 when BOOL is true and 2 when it is NIL."
  (if bool 1 2))

;;; UUIDs

;; always 16 bytes, no length header
(defun tcompact-encode-uuid (uuid)
  "Return the octets of UUID as produced by OBJ/UUID:UUID-TO-OCTET-VECTOR."
  (declare (type obj/uuid:uuid uuid))
  (obj/uuid:uuid-to-octet-vector uuid))

;;; Structs

;; struct ::= ( field-header field-value )* stop-field
;; field-header ::= field-type field-id

;; Compact protocol field header (short form) and field value:
;; +--------+--------+...+--------+
;; |ddddtttt| field value         |
;; +--------+--------+...+--------+
;;
;; Compact protocol field header (1 to 3 bytes, long form) and field value:
;; +--------+--------+...+--------+--------+...+--------+
;; |0000tttt| field id            | field value         |
;; +--------+--------+...+--------+--------+...+--------+
;;
;; Compact protocol stop field:
;; +--------+
;; |00000000|
;; +--------+
;; NOTE(review): the three stop-field diagram rows were not visible; they are
;; reproduced from the specification referenced at the top of this file.

;; sequences of zero or more 'fields' followed by a stop field.

;; each field starts with a field header and is followed by the encoded field
;; value.

;; the field-id is represented in Lisp via OBJ/ID.

;; note that it is possible to handle unknown fields while decoding. in the
;; usual case these are ignored.

(declaim (type (unsigned-byte 8) +tcompact-stop-field+))
(defconstant +tcompact-stop-field+ 0)
(deftype tcompact-field-id () '(integer 0 32767))
(deftype tcompact-field-id-delta () '(unsigned-byte 4))
(deftype tcompact-field-type-id () '(unsigned-byte 4))

;; Position in this vector + 1 is the field type id written in a field header.
(defvar *tcompact-field-types*
  #(:true :false :i8 :i16 :i32 :i64 :double :binary :list :set :map :struct :uuid))

;; NOTE(review): the original body was (1+ (aref ... n)), which adds 1 to a
;; keyword and therefore always signals a type error. It is rewritten as the
;; inverse of TCOMPACT-FIELD-TYPE-ID -- confirm that this was the intent.
(defun tcompact-field-type-id* (n)
  "Return the field type keyword whose wire id is N."
  (aref *tcompact-field-types* (1- n)))

(defun tcompact-field-type-id (k)
  "Return the wire id of the field type keyword K; signal an error when K is
not a member of *TCOMPACT-FIELD-TYPES*."
  (1+ (or (position k *tcompact-field-types*)
          (error "unknown TCompact field type: ~S" k))))

;; (ldb (byte 4 0) n)
(defun tcompact-encode-field-header-short (id-delta type-id)
  "Pack ID-DELTA and TYPE-ID into the single header octet `ddddtttt':
ID-DELTA in the high nibble, TYPE-ID in the low nibble."
  (dpb id-delta (byte 4 4)
       (dpb type-id (byte 4 0) 0)))

(defun tcompact-encode-field-id (id)
  "Encode the field id ID as a zigzag varint."
  (tcompact-encode-integer id))

(defun tcompact-encode-field-header (field)
  "Return the long-form header of FIELD as a vector of octets: the octet
`0000tttt' followed by the encoded field id.

NOTE(review): the type is taken from (THRIFT-ELEMENT-TYPE FIELD). The
original passed FIELD itself to an integer-indexed table lookup, which
cannot work -- confirm that THRIFT-ELEMENT-TYPE is the intended accessor."
  (let ((ret (make-array 5 :element-type '(unsigned-byte 8) :fill-pointer 0)))
    ;; A delta of 0 marks the long form: the field id follows explicitly.
    (vector-push (tcompact-encode-field-header-short
                  0 (tcompact-field-type-id (thrift-element-type field)))
                 ret)
    (loop for x across (tcompact-encode-field-id (id field))
          do (vector-push x ret)
          finally (return ret))))

;; Not implemented yet; returns NIL.
(defun tcompact-encode-field-value (field)
  (declare (ignore field)))

;; Not implemented yet; returns NIL.
(defun tcompact-encode-struct (struct)
  (declare (ignore struct)))

;; field-id-delta = current-field-id - previous-field-id

;;; Lists and Sets

;; Compact protocol list header (1 byte, short form) and elements:
;; +--------+--------+...+--------+
;; |sssstttt| elements            |
;; +--------+--------+...+--------+
;;
;; Compact protocol list header (2+ bytes, long form) and elements:
;; +--------+--------+...+--------+--------+...+--------+
;; |1111tttt| size                | elements            |
;; +--------+--------+...+--------+--------+...+--------+

(deftype tcompact-element-type-id () '(unsigned-byte 4))
;; tcompact short size = [0,14]

;; Position in this vector + 2 is the element type id written in a list header.
(defvar *tcompact-element-types*
  #(:bool :i8 :i16 :i32 :i64 :double :binary :list :set :map :struct :uuid))

;; NOTE(review): the original body was (+ (aref ... n) 2), which adds 2 to a
;; keyword and therefore always signals a type error. It is rewritten as the
;; inverse of TCOMPACT-ELEMENT-TYPE-ID -- confirm that this was the intent.
(defun tcompact-element-type-id* (n)
  "Return the element type keyword whose wire id is N."
  (aref *tcompact-element-types* (- n 2)))

(defun tcompact-element-type-id (k)
  "Return the wire id of the element type keyword K; signal an error when K
is not a member of *TCOMPACT-ELEMENT-TYPES*."
  (+ (or (position k *tcompact-element-types*)
         (error "unknown TCompact element type: ~S" k))
     2))

(defun tcompact-encode-list-header-short (size elt-type)
  "Pack SIZE and ELT-TYPE into the single header octet `sssstttt':
SIZE in the high nibble, ELT-TYPE in the low nibble."
  (dpb size (byte 4 4)
       (dpb elt-type (byte 4 0) 0)))

(defun tcompact-encode-list-header (list)
  "Return the header of LIST as a vector of octets.

Sizes 0 through 14 fit the one-octet short form. Larger sizes use the long
form: the octet `1111tttt' followed by the size as an unsigned varint.

NOTE(review): as in the original, the element type nibble is taken from
(ID LIST). The comments in this file describe ID as the field id, so this
is probably not the element type -- confirm the intended accessor."
  (let ((size (thrift-object-length list))
        (elt-type (id list))
        ;; 1 header octet + up to 5 varint octets for a 32-bit size.
        (ret (make-array 6 :element-type '(unsigned-byte 8) :fill-pointer 0)))
    (cond ((< size #xf)
           (vector-push (tcompact-encode-list-header-short size elt-type) ret))
          (t
           (vector-push (tcompact-encode-list-header-short #xf elt-type) ret)
           ;; The size is unsigned, so it is not zigzag-encoded.
           (loop for x across (encode-uleb128 size 4)
                 do (vector-push x ret))))
    ret))

;; Not implemented yet; returns NIL.
(defun tcompact-encode-list-element (type value)
  (declare (ignore type value)))

;;; Maps

;; map ::= empty-map | non-empty-map
;; non-empty-map ::= size key-element-type value-element-type (key value)+

;; Compact protocol map header (1 byte, empty map):
;; +--------+
;; |00000000|
;; +--------+
;; NOTE(review): the three empty-map diagram rows were not visible; they are
;; reproduced from the specification referenced at the top of this file.
;;
;; Compact protocol map header (2+ bytes, non empty map) and key value pairs:
;; +--------+...+--------+--------+--------+...+--------+
;; | size                |kkkkvvvv| key value pairs     |
;; +--------+...+--------+--------+--------+...+--------+

;; Not implemented yet; returns NIL.
(defun tcompact-encode-map ())