changelog shortlog graph tags branches files raw help

Mercurial > core / changeset: init tcompact encoders

changeset 548: b57066450cfa
parent 547: ac01164b4141
child 549: 32bd859533b3
author: Richard Westhaver <ellis@rwest.io>
date: Mon, 15 Jul 2024 20:59:04 -0400
files: lisp/lib/dat/dat.asd lisp/lib/dat/parquet/gen.lisp lisp/lib/dat/parquet/io.lisp lisp/lib/dat/parquet/obj.lisp lisp/lib/dat/parquet/tcompact.lisp lisp/lib/dat/pkg.lisp lisp/lib/dat/tests.lisp lisp/std/num/float.lisp lisp/std/num/leb128.lisp
description: init tcompact encoders
     1.1--- a/lisp/lib/dat/dat.asd	Mon Jul 15 15:18:03 2024 -0400
     1.2+++ b/lisp/lib/dat/dat.asd	Mon Jul 15 20:59:04 2024 -0400
     1.3@@ -14,6 +14,7 @@
     1.4                 ((:file "gen")
     1.5                  (:file "pkg")
     1.6                  (:file "obj")
     1.7+                 (:file "tcompact")
     1.8                  (:file "io")
     1.9                  (:file "rle")
    1.10                  (:file "proto")))
     2.1--- a/lisp/lib/dat/parquet/gen.lisp	Mon Jul 15 15:18:03 2024 -0400
     2.2+++ b/lisp/lib/dat/parquet/gen.lisp	Mon Jul 15 20:59:04 2024 -0400
     2.3@@ -100,21 +100,6 @@
     2.4             (:constructor make-parquet-struct-field (key name type-id type doc required)))
     2.5   key name type-id type doc required)
     2.6 
     2.7-(defun parquet-destruct-field (field)
     2.8-  (list (parquet-struct-field-name field)
     2.9-        (parquet-struct-field-key field)
    2.10-        (parquet-struct-field-doc field)
    2.11-        (parquet-struct-field-type-id field)
    2.12-        (parquet-struct-field-type field)
    2.13-        (parquet-struct-field-required field)))
    2.14-
    2.15-(defun parquet-destruct (struct)
    2.16-  (list (parquet-struct-name struct)
    2.17-        (parquet-struct-doc struct)
    2.18-        (parquet-struct-unionp struct)
    2.19-        (parquet-struct-exceptionp struct)
    2.20-        (mapcar #'parquet-destruct-field (parquet-struct-fields struct))))
    2.21-
    2.22 (defun parquet-json-structs () ;; name doc isException isUnion fields
    2.23   (mapcar
    2.24    (lambda (s)
    2.25@@ -212,12 +197,6 @@
    2.26   (with-package (:dat/parquet)
    2.27     (define-parquet-class parquet-struct-object () ())
    2.28     (let ((types (define-parquet-types)))
    2.29-      (export types)
    2.30-      (deftype dat/parquet::parquet-type (&optional (designator octet-vector) optional)
    2.31-        (if optional
    2.32-            (if (eql designator 'list)
    2.33-                list
    2.34-                `(or null ,designator))
    2.35-            designator)))
    2.36+      (export types))
    2.37     (export (mapcar 'class-name (define-parquet-structs)))
    2.38     (export *parquet-enums*)))
     3.1--- a/lisp/lib/dat/parquet/io.lisp	Mon Jul 15 15:18:03 2024 -0400
     3.2+++ b/lisp/lib/dat/parquet/io.lisp	Mon Jul 15 20:59:04 2024 -0400
     3.3@@ -1,6 +1,6 @@
     3.4 ;;; io.lisp --- Parquet IO
     3.5 
     3.6-;; 
     3.7+;;
     3.8 
     3.9 ;;; Code:
    3.10 (in-package :dat/parquet)
    3.11@@ -10,9 +10,47 @@
    3.12   (write-string +parquet-magic-number+ stream))
    3.13 
    3.14 (defun parquet-read-magic (stream)
    3.15-  (assert (char= #.(char +parquet-magic-number+ 0) (read-char stream)))
    3.16-  (assert (char= #.(char +parquet-magic-number+ 1) (read-char stream)))
    3.17-  (assert (char= #.(char +parquet-magic-number+ 2) (read-char stream)))
    3.18-  (assert (char= #.(char +parquet-magic-number+ 3) (read-char stream)))
    3.19-  t)
    3.20+  (assert (= #.(char-code (aref +parquet-magic-number+ 0)) (read-byte stream)))
    3.21+  (assert (= #.(char-code (aref +parquet-magic-number+ 1)) (read-byte stream)))
    3.22+  (assert (= #.(char-code (aref +parquet-magic-number+ 2)) (read-byte stream)))
    3.23+  (assert (= #.(char-code (aref +parquet-magic-number+ 3)) (read-byte stream))))
    3.24+
    3.25+(defun parquet-read-unsigned (stream)
    3.26+  (read-uleb128 stream))
    3.27+
    3.28+(defun parquet-read-signed (stream)
    3.29+  (read-leb128 stream))
    3.30+
    3.31+(defun parquet-read-boolean (stream)
    3.32+  (ecase (read-byte stream)
    3.33+    (0 nil)
    3.34+    (1 t)))
    3.35 
    3.36+(defun parquet-file-stream-p (stream)
    3.37+  "Assert the start and end of a file STREAM are the parquet magic bytes."
    3.38+  (parquet-read-magic stream)
    3.39+  ;; set position to end - 4
    3.40+  (file-position stream (- (the fixnum (file-length stream)) 4))
    3.41+  (parquet-read-magic stream))
    3.42+
    3.43+(defun parquet-read-schema-element (stream))
    3.44+(defun parquet-read-schema (stream)
    3.45+  "Read a parquet-schema which is repeated list of parquet-schema-element."
    3.46+  
    3.47+  )
    3.48+
    3.49+(defun parquet-read-file-meta-data (stream)
    3.50+  "Read a parquet-file-meta-data object from STREAM."
    3.51+  ;; version
    3.52+  (make-instance 'parquet-file-meta-data
    3.53+    :version (parquet-read-signed stream)
    3.54+    :schema (list (parquet-read-signed stream))))
    3.55+
    3.56+(defun parquet-read-footer (stream)
    3.57+  "Read the footer of parquet data in STREAM."
    3.58+  (parquet-file-stream-p stream)
    3.59+  ;; set file-position, read metadata length and magic
    3.60+  (file-position stream (- (file-length stream) 8))
    3.61+  (let ((len (parquet-read-unsigned stream)))
    3.62+    (file-position stream (- (file-length stream) 8 len))
    3.63+    (parquet-read-file-meta-data stream)))
     4.1--- a/lisp/lib/dat/parquet/obj.lisp	Mon Jul 15 15:18:03 2024 -0400
     4.2+++ b/lisp/lib/dat/parquet/obj.lisp	Mon Jul 15 20:59:04 2024 -0400
     4.3@@ -8,16 +8,17 @@
     4.4 (eval-always
     4.5   (dat/parquet/gen::load-parquet))
     4.6 
     4.7-(deftype parquet-compression-codec () `(member ,*parquet-compression-codecs*))
     4.8-
     4.9-(deftype parquet-boundary-order () `(member ,*parquet-boundary-orders*))
    4.10-
    4.11-(deftype parquet-encoding () `(member ,*parquet-encodings*))
    4.12-
    4.13-(deftype parquet-field-repetition () `(member ,*parquet-field-repetition-types*))
    4.14-
    4.15-(deftype parquet-type-designator () `(member ,*parquet-types*))
    4.16-
    4.17-(deftype parquet-converted-type-designator () `(member ,*parquet-converted-types*))
    4.18-
    4.19-(deftype parquet-page-type () `(member ,*parquet-page-types*))
    4.20+(macrolet ((def-parquet-type (name)
    4.21+             (let ((var-name (symbolicate "*" name "S*"))
    4.22+                   (name1 (symbolicate name "*")))
    4.23+               `(progn
    4.24+                  (deftype ,name () `(member ,,var-name))
    4.25+                  (defun ,name (d) (position d ,var-name :test 'eql))
    4.26+                  (defun ,name1 (n) (elt ,var-name n))))))
    4.27+  (def-parquet-type parquet-compression-codec)
    4.28+  (def-parquet-type parquet-boundary-order)
    4.29+  (def-parquet-type parquet-encoding)
    4.30+  (def-parquet-type parquet-field-repetition-type)
    4.31+  (def-parquet-type parquet-type)
    4.32+  (def-parquet-type parquet-converted-type)
    4.33+  (def-parquet-type parquet-page-type))
     5.1--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     5.2+++ b/lisp/lib/dat/parquet/tcompact.lisp	Mon Jul 15 20:59:04 2024 -0400
     5.3@@ -0,0 +1,184 @@
     5.4+;;; tcompact.lisp --- Thrift Compact Protocol
     5.5+
     5.6+;; ref: https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md
     5.7+
     5.8+;;; Commentary:
     5.9+
    5.10+;; in order to encode Parquet, we need to be able to encode the Thrift Compact
    5.11+;; Protocol (TCompact). All thrift structures we've generated via parquet.json
    5.12+;; are serialized using TCompact.
    5.13+
    5.14+;; see also: https://thrift.apache.org/static/files/thrift-20070401.pdf
    5.15+
    5.16+;;; Code:
    5.17+(in-package :dat/parquet)
    5.18+
    5.19+;;; Protocol
    5.20+
    5.21+(defgeneric tcompact-field-id (self))
    5.22+(defgeneric tcompact-element-id (self))
    5.23+
    5.24+;;; Integers
    5.25+
    5.26+#|
    5.27+50399 =          11000100 11011111  (LSB)
    5.28+      =  0000011  0001001  1011111  (7-bit groups)
    5.29+      = 00000011 10001001 11011111  (add continuation bits)
    5.30+      =     0x03     0x89     0xDF  (hex)
    5.31+→ 0xDF 0x89 0x03 (write to ram LSB first)
    5.32+|#
    5.33+
    5.34+;; encoded as ULEB128. signed and unsigned bytes are encoded as single
    5.35+;; bytes. all others are coverted to int64.
    5.36+(defun zigzag (n)
    5.37+  (declare (integer n))
    5.38+  (logxor (ash n 1) (ash n -63)))
    5.39+
    5.40+(defun zagzig (n)
    5.41+  (declare (fixnum n))
    5.42+  (logxor (ash n -1) (- (logand n 1))))
    5.43+
    5.44+(defun tcompact-encode-integer (n &optional (size 8))
    5.45+  (declare (integer n))
    5.46+  (if (<= (integer-length n) 8)
    5.47+      (vector n)
    5.48+      (encode-uleb128 (zigzag n) size)))
    5.49+
    5.50+;; (zagzig (zigzag -3)) ;=-3
    5.51+
    5.52+;;; Enums
    5.53+
    5.54+;; ordinal value encoded as int32
    5.55+(defun tcompact-encode-enum (n)
    5.56+  (tcompact-encode-integer n 4))
    5.57+
    5.58+;;; Binary
    5.59+
    5.60+#|
    5.61+Binary protocol, binary data, 1+ bytes:
    5.62++--------+...+--------+--------+...+--------+
    5.63+| byte length         | bytes               |
    5.64++--------+...+--------+--------+...+--------+
    5.65+|#
    5.66+
    5.67+;; a varint followed by the bytes
    5.68+(defun tcompact-encode-octet-vector (octets)
    5.69+  (concatenate 'octet-vector
    5.70+               (tcompact-encode-integer (length octets))
    5.71+               octets))
    5.72+
    5.73+;;; String
    5.74+
    5.75+;; encoded as UTF-8 bytes without null-termination
    5.76+(defun tcompact-encode-string (string)
    5.77+  (sb-ext:string-to-octets string :external-format :utf-8))
    5.78+
    5.79+;;; UUID
    5.80+
    5.81+;; always 16 bytes, no length header
    5.82+(defun tcompact-encode-uuid (uuid)
    5.83+  (declare (obj/uuid:uuid uuid))
    5.84+  (obj/uuid:uuid-to-octet-vector uuid))
    5.85+
    5.86+;;; Structs
    5.87+
    5.88+;; struct        ::= ( field-header field-value )* stop-field
    5.89+;; field-header  ::= field-type field-id
    5.90+
    5.91+#|
    5.92+Compact protocol field header (short form) and field value:
    5.93++--------+--------+...+--------+
    5.94+|ddddtttt| field value         |
    5.95++--------+--------+...+--------+
    5.96+
    5.97+Compact protocol field header (1 to 3 bytes, long form) and field value:
    5.98++--------+--------+...+--------+--------+...+--------+
    5.99+|0000tttt| field id            | field value         |
   5.100++--------+--------+...+--------+--------+...+--------+
   5.101+
   5.102+Compact protocol stop field:
   5.103++--------+
   5.104+|00000000|
   5.105++--------+
   5.106+|#
   5.107+
   5.108+;; sequences of zero or more 'fields' followed by a stop field.
   5.109+
   5.110+;; each field starts with a field header and is followed by the encoded field
   5.111+;; value.
   5.112+
   5.113+;; the field-id is represented in Lisp via OBJ/ID.
   5.114+
   5.115+;; note that it is possible to handle unknown fields while decoding. in the
   5.116+;; usual case these are ignored.
   5.117+
   5.118+(declaim ((unsigned-byte 8) +tcompact-stop-field+))
   5.119+(defconstant +tcompact-stop-field+ 0)
   5.120+(deftype tcompact-field-id () '(integer 0 32767))
   5.121+(deftype tcompact-field-id-delta () '(unsigned-byte 4))
   5.122+(deftype tcompact-field-type-id () '(unsigned-byte 4))
   5.123+
   5.124+(defvar *tcompact-field-types*
   5.125+  #(:true :false :i8 :i16 :i32 :i64 :double :binary :list :set :map :struct :uuid))
   5.126+(defun tcompact-field-type-id (n) (1+ (aref *tcompact-field-types* n)))
   5.127+
   5.128+;; (ldb (byte 4 0) n)
   5.129+(defun tcompact-encode-field-header-short (id-delta type-id)
   5.130+  (dpb type-id (byte 4 4)
   5.131+       (dpb id-delta (byte 4 0) 0)))
   5.132+
   5.133+(defun tcompact-encode-field-id (id)
   5.134+  (tcompact-encode-integer id))
   5.135+
   5.136+(defun tcompact-encode-field-header (field)
   5.137+  (concatenate 'octet-vector
   5.138+               (tcompact-encode-field-header-short 0 (tcompact-field-type-id field))
   5.139+               (tocmpact-encode-field-id (tcompact-field-id field))))
   5.140+
   5.141+(defun tcompact-encode-field-value (field))
   5.142+
   5.143+(defun tcompact-encode-struct (struct))
   5.144+  ;; field-id-delta = current-field-id - previous-field-id
   5.145+
   5.146+;;; List and Set
   5.147+
   5.148+#|
   5.149+Compact protocol list header (1 byte, short form) and elements:
   5.150++--------+--------+...+--------+
   5.151+|sssstttt| elements            |
   5.152++--------+--------+...+--------+
   5.153+
   5.154+Compact protocol list header (2+ bytes, long form) and elements:
   5.155++--------+--------+...+--------+--------+...+--------+
   5.156+|1111tttt| size                | elements            |
   5.157++--------+--------+...+--------+--------+...+--------+
   5.158+|#
   5.159+
   5.160+(deftype tcompact-element-type-id () '(unsigned-byte 4))
   5.161+
   5.162+(defvar *tcompact-element-types*
   5.163+  #(:bool :i8 :i16 :i32 :i64 :double :binary :list :set :map :struct :uuid))
   5.164+
   5.165+(defun tcompact-element-type-id (n) (+ (aref *tcompact-element-types* n) 2))
   5.166+
   5.167+(defun tcompact-encode-list ())
   5.168+
   5.169+;;; Map
   5.170+
   5.171+;; map           ::= empty-map | non-empty-map
   5.172+;; empty-map     ::= `0`
   5.173+;; non-empty-map ::= size key-element-type value-element-type (key value)+
   5.174+
   5.175+#|
   5.176+Compact protocol map header (1 byte, empty map):
   5.177++--------+
   5.178+|00000000|
   5.179++--------+
   5.180+
   5.181+Compact protocol map header (2+ bytes, non empty map) and key value pairs:
   5.182++--------+...+--------+--------+--------+...+--------+
   5.183+| size                |kkkkvvvv| key value pairs     |
   5.184++--------+...+--------+--------+--------+...+--------+
   5.185+|#
   5.186+
   5.187+(defun tcompact-encode-map ())
     6.1--- a/lisp/lib/dat/pkg.lisp	Mon Jul 15 15:18:03 2024 -0400
     6.2+++ b/lisp/lib/dat/pkg.lisp	Mon Jul 15 20:59:04 2024 -0400
     6.3@@ -234,7 +234,7 @@
     6.4   (:export))
     6.5 
     6.6 (defpackage :dat/parquet
     6.7-  (:use :cl :std :dat/proto :dat/json)
     6.8+  (:use :cl :std :obj/id :dat/proto :dat/json)
     6.9   (:export
    6.10    :parquet-object
    6.11    :parquet-enum-object
     7.1--- a/lisp/lib/dat/tests.lisp	Mon Jul 15 15:18:03 2024 -0400
     7.2+++ b/lisp/lib/dat/tests.lisp	Mon Jul 15 20:59:04 2024 -0400
     7.3@@ -146,10 +146,12 @@
     7.4     (with-output-to-string (s)
     7.5       (is (write-sxp-stream f s)))))
     7.6 
     7.7+;; see also: https://github.com/apache/parquet-testing/blob/master/data/README.md
     7.8 (deftest parquet-basic ()
     7.9-  (is
    7.10-   (with-input-from-string
    7.11-       (s
    7.12-        (with-output-to-string (s)
    7.13-          (dat/parquet::parquet-write-magic s)))
    7.14-     (dat/parquet::parquet-read-magic s))))
    7.15+  (with-open-file (st "/home/ellis/comp/core/.stash/datapage_v1-uncompressed-checksum.parquet" :element-type '(unsigned-byte 8))
    7.16+    (let ((footer (dat/parquet::parquet-read-footer st)))
    7.17+      (is (typep footer
    7.18+                 'dat/parquet::parquet-file-meta-data))
    7.19+      (print (slot-value footer 'dat/parquet::schema))
    7.20+      (print (file-position st))
    7.21+      (print (file-length st)))))
     8.1--- a/lisp/std/num/float.lisp	Mon Jul 15 15:18:03 2024 -0400
     8.2+++ b/lisp/std/num/float.lisp	Mon Jul 15 20:59:04 2024 -0400
     8.3@@ -10,7 +10,7 @@
     8.4 ;; Note that the physical encoding is always represented as a fixnum.
     8.5 
     8.6 ;; To read/write from a file you must pass through a fixnum repr to bytes,
     8.7-;; usually using octets-to-integer or integer-to-octets. There are also
     8.8+;; usually using octets-to-integer or integer-to-octets.
     8.9 
    8.10 ;;; Code:
    8.11 
     9.1--- a/lisp/std/num/leb128.lisp	Mon Jul 15 15:18:03 2024 -0400
     9.2+++ b/lisp/std/num/leb128.lisp	Mon Jul 15 20:59:04 2024 -0400
     9.3@@ -17,7 +17,6 @@
     9.4   "Encode an integer of arbitrary length into a leb128 unsigned-8 buffer"
     9.5   (let ((more t) (curr) (in 0) (int (make-array
     9.6                                      4
     9.7-                                     :adjustable t
     9.8                                      :fill-pointer 0
     9.9                                      :element-type '(unsigned-byte 8)))) ;(neg (< i 0))
    9.10     (declare (fixnum i in))
    9.11@@ -68,14 +67,16 @@
    9.12                (return-from decode-leb128 (values (logior result (the fixnum (ash (lognot 0) shift))) counter))
    9.13                (return-from decode-leb128 (values result counter)))))))
    9.14 
    9.15-(declaim (ftype (function (fixnum) (simple-array (unsigned-byte 8))) encode-uleb128))
    9.16-(defun encode-uleb128 (int)
    9.17-  "Encode an integer INT as a ULEB128 byte array."
    9.18+(declaim (ftype (function (fixnum &optional (unsigned-byte 8)) (simple-array (unsigned-byte 8))) encode-uleb128))
    9.19+(defun encode-uleb128 (int &optional size)
    9.20+  "Encode an integer INT as a ULEB128 byte array with SIZE (in bytes)."
    9.21   (declare (fixnum int))
    9.22   (let ((more t) (curr) (in 0) (ret (make-array
    9.23-                                     (if (zerop int)
    9.24-                                         1
    9.25-                                         (ceiling  (/ (log (+ int 1) 2) 7)))
    9.26+                                     (if size
    9.27+                                         size
    9.28+                                         (if (zerop int)
    9.29+                                             1
    9.30+                                             (ceiling  (/ (log (+ int 1) 2) 7))))
    9.31                                      :element-type '(unsigned-byte 8)))) ;(neg (< int 0))
    9.32     (loop while more do
    9.33          (setf curr (logand int #x7f))
    9.34@@ -85,14 +86,13 @@
    9.35              (setf curr (logior curr #x80)))
    9.36          (setf (aref ret in) curr)
    9.37          (incf in))
    9.38-    (coerce ret 'simple-array)))
    9.39+    ret))
    9.40 
    9.41-(declaim (ftype (function ((array (unsigned-byte 8)) &optional t) fixnum) decode-uleb128))
    9.42+(declaim (ftype (function ((vector unsigned-byte) &optional fixnum) fixnum) decode-uleb128))
    9.43 (defun decode-uleb128 (bits &optional (start 0))
    9.44   "Decode an unsigned integer from ULEB128 byte array."
    9.45   (let ((result 0) (shift 0) (curr) (counter 0))
    9.46-    (declare (fixnum result shift counter start)
    9.47-             ((array (unsigned-byte 8)) bits))
    9.48+    (declare (fixnum result shift counter start))
    9.49     (loop do 
    9.50          (setf curr (aref bits start))
    9.51          (setf start (+ 1 start))