changelog shortlog graph tags branches changeset files revisions annotate raw help

Mercurial > core / lisp/lib/dat/parquet/tcompact.lisp

changeset 550: 4d34907c69eb
parent: 32bd859533b3
author: Richard Westhaver <ellis@rwest.io>
date: Tue, 16 Jul 2024 21:52:09 -0400
permissions: -rw-r--r--
description: more work on tcompact/thrift, fixed type info in parquet-struct-objects
1 ;;; tcompact.lisp --- Thrift Compact Protocol
2 
3 ;; ref: https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md
4 
5 ;;; Commentary:
6 
7 ;; in order to encode Parquet, we need to be able to encode the Thrift Compact
8 ;; Protocol (TCompact). All thrift structures we've generated via parquet.json
9 ;; are serialized using TCompact.
10 
11 ;; see also: https://thrift.apache.org/static/files/thrift-20070401.pdf
12 
13 ;;; Code:
14 (in-package :dat/parquet)
15 
16 ;;; Protocol
17 
(defclass thrift-object (id)
  ()
  (:documentation
   "Class with the single direct superclass ID and no direct slots of its own."))
19 
(defgeneric thrift-element-type (self)
  (:documentation
   "Return a keyword naming the Thrift element type of SELF.
The only method defined here answers :STRUCT for PARQUET-STRUCT-OBJECT
instances.")
  (:method ((self parquet-struct-object))
    :struct))
22 
(defgeneric thrift-object-length (self)
  (:documentation
   "Generic function of one argument, SELF.  No methods are defined at this
point; the list-header encoder calls it to obtain the size it writes."))
24 
25 ;;; Integers
26 
27 #|
28 50399 = 11000100 11011111 (LSB)
29  = 0000011 0001001 1011111 (7-bit groups)
30  = 00000011 10001001 11011111 (add continuation bits)
31  = 0x03 0x89 0xDF (hex)
32 → 0xDF 0x89 0x03 (write to ram LSB first)
33 |#
34 
35 ;; encoded as ULEB128. signed and unsigned bytes are encoded as single
36 ;; bytes. all others are converted to int64.
(defun zigzag (n)
  "ZigZag-encode the integer N.
The result is N shifted left by one bit XORed with N arithmetically shifted
right by 63 bits, so for N in the signed 64-bit range small magnitudes of
either sign map to small non-negative integers (0 -> 0, -1 -> 1, 1 -> 2, ...)."
  (declare (integer n))
  (let ((doubled (ash n 1))
        ;; 0 for non-negative N, -1 (all ones) for negative N in int64 range.
        (sign-fill (ash n -63)))
    (logxor doubled sign-fill)))
40 
(defun zagzig (n)
  "Decode the ZigZag-encoded integer N back to a signed integer.
The low bit of N selects the sign; the remaining bits carry the magnitude."
  (declare (integer n))
  (let ((payload (ash n -1))
        ;; 0 when the low bit is clear, -1 (all ones) when it is set.
        (sign-mask (- (logand n 1))))
    (logxor payload sign-mask)))
44 
(defun tcompact-encode-integer (n &optional (size 8))
  "Encode the integer N for the Thrift Compact Protocol.

SIZE is the width in octets of the Thrift integer type being written:
  - SIZE = 1 (i8): N is written as a single octet (two's complement), returned
    as a one-element vector.
  - any other SIZE (i16/i32/i64): N is ZigZag-encoded and handed to
    ENCODE-ULEB128 together with SIZE; that function's result is returned.

The encoding is selected by SIZE rather than by the magnitude of N: a decoder
picks the format from the declared type, so a small i32 must still be a
ZigZag varint, and a negative i8 must still be a valid octet."
  (declare (integer n))
  (if (eql size 1)
      ;; LDB keeps the low 8 bits, so -1 becomes #xFF instead of the
      ;; non-octet value -1.
      (vector (ldb (byte 8 0) n))
      (encode-uleb128 (zigzag n) size)))
50 
51 ;;; Enums
52 
53 ;; ordinal value encoded as int32
(defun tcompact-encode-enum (n)
  "Encode the enum ordinal N by delegating to TCOMPACT-ENCODE-INTEGER with a
size argument of 4 (ordinals are treated as int32)."
  (let ((int32-octets 4))
    (tcompact-encode-integer n int32-octets)))
56 
57 ;;; Binary
58 
59 #|
60 Binary protocol, binary data, 1+ bytes:
61 +--------+...+--------+--------+...+--------+
62 | byte length | bytes |
63 +--------+...+--------+--------+...+--------+
64 |#
65 
66 ;; a varint followed by the bytes
(defun tcompact-encode-octet-vector (octets)
  "Encode OCTETS as Thrift Compact binary data: the length of OCTETS as an
unsigned varint, followed by the octets themselves.  Returns an OCTET-VECTOR.

The length is a plain (non-ZigZag) varint, so it is passed straight to
ENCODE-ULEB128 instead of going through the signed integer encoder.
NOTE(review): assumes ENCODE-ULEB128 accepts a non-negative integer plus an
octet width (4 = int32) and returns a sequence of octets -- confirm against
its definition."
  (concatenate 'octet-vector
               (encode-uleb128 (length octets) 4)
               octets))
71 
72 ;;; String
73 
74 ;; encoded as UTF-8 bytes without null-termination
(defun tcompact-encode-string (string)
  "Return the UTF-8 octets of STRING with no trailing null octet.
No length prefix is added here."
  (sb-ext:string-to-octets string
                           :external-format :utf-8
                           :null-terminate nil))
77 
78 ;;; Double
(defun tcompact-encode-double (float)
  "Encode FLOAT as a Thrift Compact double: the IEEE-754 binary64 bit pattern
written as 8 octets in little-endian order.

FLOAT may be any real; it is coerced to DOUBLE-FLOAT first.  Returns a fresh
8-element (UNSIGNED-BYTE 8) vector.

A double is a fixed-width 64-bit value, not a varint, and must keep full
double precision, so it is not routed through the float32 / integer encoders."
  (let* ((double (coerce float 'double-float))
         (low (sb-kernel:double-float-low-bits double))
         ;; The high word is returned as a signed 32-bit integer; mask it to
         ;; its unsigned bit pattern before combining.
         (high (ldb (byte 32 0) (sb-kernel:double-float-high-bits double)))
         (bits (logior low (ash high 32)))
         (octets (make-array 8 :element-type '(unsigned-byte 8))))
    ;; Least-significant octet first.
    (dotimes (i 8 octets)
      (setf (aref octets i) (ldb (byte 8 (* 8 i)) bits)))))
81 
82 ;;; Boolean
83 
(defun tcompact-encode-boolean (bool)
  "Return 1 when BOOL is non-NIL and 0 when BOOL is NIL."
  (cond (bool 1)
        (t 0)))
86 
87 ;;; UUID
88 
89 ;; always 16 bytes, no length header
(defun tcompact-encode-uuid (uuid)
  "Return the octet-vector form of UUID as produced by
OBJ/UUID:UUID-TO-OCTET-VECTOR; nothing is prepended or appended."
  (declare (type obj/uuid:uuid uuid))
  (obj/uuid:uuid-to-octet-vector uuid))
93 
94 ;;; Structs
95 
96 ;; struct ::= ( field-header field-value )* stop-field
97 ;; field-header ::= field-type field-id
98 
99 #|
100 Compact protocol field header (short form) and field value:
101 +--------+--------+...+--------+
102 |ddddtttt| field value |
103 +--------+--------+...+--------+
104 
105 Compact protocol field header (1 to 3 bytes, long form) and field value:
106 +--------+--------+...+--------+--------+...+--------+
107 |0000tttt| field id | field value |
108 +--------+--------+...+--------+--------+...+--------+
109 
110 Compact protocol stop field:
111 +--------+
112 |00000000|
113 +--------+
114 |#
115 
116 ;; sequences of zero or more 'fields' followed by a stop field.
117 
118 ;; each field starts with a field header and is followed by the encoded field
119 ;; value.
120 
121 ;; the field-id is represented in Lisp via OBJ/ID.
122 
123 ;; note that it is possible to handle unknown fields while decoding. in the
124 ;; usual case these are ignored.
125 
(declaim (type (unsigned-byte 8) +tcompact-stop-field+))
(defconstant +tcompact-stop-field+ 0
  "Octet that terminates a struct's field sequence (the stop field).")

(deftype tcompact-field-id ()
  "A field id: an integer from 0 to 32767 inclusive."
  '(integer 0 32767))

(deftype tcompact-field-id-delta ()
  "The 4-bit field-id delta carried in a short-form field header."
  '(unsigned-byte 4))

(deftype tcompact-field-type-id ()
  "The 4-bit field-type id carried in a field header."
  '(unsigned-byte 4))
131 
(defvar *tcompact-field-types*
  #(:true :false :i8 :i16 :i32 :i64 :double :binary :list :set :map :struct :uuid)
  "Field-type keywords ordered so that the wire id of a type is its index in
this vector plus one (:TRUE = 1 ... :UUID = 13).")

(defun tcompact-field-type-id* (n)
  "Return the field-type keyword whose wire id is N; this is the inverse of
TCOMPACT-FIELD-TYPE-ID.  Signals an error when N is outside 1..13.

The earlier definition applied 1+ to the keyword fetched from the table,
which is a type error for every input."
  (aref *tcompact-field-types* (1- n)))

(defun tcompact-field-type-id (k)
  "Return the wire id (1-based) of the field-type keyword K.
Signals an error when K is not a known field type."
  (let ((index (position k *tcompact-field-types*)))
    (unless index
      (error "Unknown TCompact field type: ~S" k))
    (1+ index)))
136 
137 ;; (ldb (byte 4 0) n)
(defun tcompact-encode-field-header-short (id-delta type-id)
  "Pack a short-form field header into one octet laid out as ddddtttt:
ID-DELTA occupies the high nibble and TYPE-ID the low nibble.
Only the low 4 bits of each argument are used.

With ID-DELTA = 0 the result is the 0000tttt first octet of the long form."
  (dpb id-delta (byte 4 4)
       (dpb type-id (byte 4 0) 0)))
141 
(defun tcompact-encode-field-id (id)
  "Encode the field id ID by delegating to TCOMPACT-ENCODE-INTEGER with that
function's default size; its result is returned unchanged."
  (tcompact-encode-integer id))
144 
(defun tcompact-encode-field-header (field)
  "Build a field header for FIELD in a fresh fill-pointer vector of octets
(capacity 5): first the octet from TCOMPACT-ENCODE-FIELD-HEADER-SHORT with an
id-delta of 0, then every octet of the encoded field id, (ID FIELD).

NOTE(review): FIELD itself is passed to TCOMPACT-FIELD-TYPE-ID*, which indexes
a vector with its argument -- confirm what that helper is meant to receive."
  (let ((header (make-array 5 :element-type '(unsigned-byte 8) :fill-pointer 0)))
    (vector-push (tcompact-encode-field-header-short
                  0
                  (tcompact-field-type-id* field))
                 header)
    (let ((id-octets (tcompact-encode-field-id (id field))))
      ;; VECTOR-PUSH does not grow the vector; the capacity above bounds
      ;; how many id octets are kept.
      (dotimes (i (length id-octets))
        (vector-push (aref id-octets i) header)))
    header))
152 
(defun tcompact-encode-field-value (field)
  "Placeholder: not implemented yet.  Ignores FIELD and returns NIL."
  (declare (ignore field))
  nil)
154 
(defun tcompact-encode-struct (struct)
  "Placeholder: not implemented yet.  Ignores STRUCT and returns NIL."
  (declare (ignore struct))
  nil)
156 
157  ;; field-id-delta = current-field-id - previous-field-id
158 
159 ;;; List and Set
160 
161 #|
162 Compact protocol list header (1 byte, short form) and elements:
163 +--------+--------+...+--------+
164 |sssstttt| elements |
165 +--------+--------+...+--------+
166 
167 Compact protocol list header (2+ bytes, long form) and elements:
168 +--------+--------+...+--------+--------+...+--------+
169 |1111tttt| size | elements |
170 +--------+--------+...+--------+--------+...+--------+
171 |#
172 
(deftype tcompact-element-type-id ()
  "The 4-bit element-type id carried in a list or set header."
  '(unsigned-byte 4))
174 ;; tcompact short size = [0,14]
175 
(defvar *tcompact-element-types*
  #(:bool :i8 :i16 :i32 :i64 :double :binary :list :set :map :struct :uuid)
  "Element-type keywords ordered so that the wire id of a type is its index in
this vector plus two (:BOOL = 2 ... :UUID = 13).")

(defun tcompact-element-type-id* (n)
  "Return the element-type keyword whose wire id is N; this is the inverse of
TCOMPACT-ELEMENT-TYPE-ID.  Signals an error when N is outside 2..13.

The earlier definition added 2 to the keyword fetched from the table, which
is a type error for every input."
  (aref *tcompact-element-types* (- n 2)))

(defun tcompact-element-type-id (k)
  "Return the wire id of the element-type keyword K (its table index plus 2).
Signals an error when K is not a known element type."
  (let ((index (position k *tcompact-element-types*)))
    (unless index
      (error "Unknown TCompact element type: ~S" k))
    (+ index 2)))
181 
(defun tcompact-encode-list-header-short (size elt-type)
  "Pack a short-form list/set header into one octet laid out as sssstttt:
SIZE occupies the high nibble and ELT-TYPE the low nibble.
Only the low 4 bits of each argument are used.

With SIZE = #xF the result is the 1111tttt first octet of the long form."
  (dpb size (byte 4 4)
       (dpb elt-type (byte 4 0) 0)))
185 
(defun tcompact-encode-list-header (list)
  "Build a long-form list header for LIST in a fresh fill-pointer vector of
octets: one 1111tttt octet followed by the size as an unsigned varint.

The size comes from (THRIFT-OBJECT-LENGTH LIST).  The long form is always
emitted; the one-octet short form for sizes up to 14 is not selected here.

NOTE(review): the type nibble is taken from (ID LIST), as before -- confirm
that this really is the element-type id.
NOTE(review): assumes ENCODE-ULEB128 accepts a non-negative integer plus an
octet width (4 = int32) and returns a vector of octets -- confirm against its
definition."
  ;; Capacity 6 = 1 header octet + up to 5 varint octets for a 32-bit size.
  ;; VECTOR-PUSH never grows the vector, so a smaller capacity would silently
  ;; drop the final octet of a large size.
  (let ((ret (make-array 6 :element-type '(unsigned-byte 8) :fill-pointer 0)))
    ;; High nibble all ones marks the long form; low nibble is the type.
    (vector-push (logior #xf0 (ldb (byte 4 0) (id list))) ret)
    ;; The size is a plain varint, not ZigZag, so it bypasses the signed
    ;; integer encoder.
    (loop for x across (encode-uleb128 (thrift-object-length list) 4)
          do (vector-push x ret)
          finally (return ret))))
192 
(defun tcompact-encode-list-element (type value)
  "Placeholder: not implemented yet.  Ignores TYPE and VALUE and returns NIL."
  (declare (ignore type value))
  nil)
194 
195 ;;; Map
196 
197 ;; map ::= empty-map | non-empty-map
198 ;; empty-map ::= `0`
199 ;; non-empty-map ::= size key-element-type value-element-type (key value)+
200 
201 #|
202 Compact protocol map header (1 byte, empty map):
203 +--------+
204 |00000000|
205 +--------+
206 
207 Compact protocol map header (2+ bytes, non empty map) and key value pairs:
208 +--------+...+--------+--------+--------+...+--------+
209 | size |kkkkvvvv| key value pairs |
210 +--------+...+--------+--------+--------+...+--------+
211 |#
212 
(defun tcompact-encode-map ()
  "Placeholder: not implemented yet.  Takes no arguments and returns NIL."
  nil)