changelog shortlog graph tags branches changeset files revisions annotate raw help

Mercurial > core / lisp/lib/dat/parquet/parquet.lisp

changeset 543: b88bd4b0a039
parent: 10c4bb778030
author: Richard Westhaver <ellis@rwest.io>
date: Sat, 13 Jul 2024 00:03:13 -0400
permissions: -rw-r--r--
description: tweaks
1 ;;; parquet.lisp --- Apache Parquet
2 
3 ;; Common Lisp implementation of Apache Parquet
4 
5 ;;; Commentary:
6 
7 #|
8 https://github.com/apache/parquet-format
9 https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
10 https://github.com/apache/parquet-testing
11 https://github.com/apache/parquet-java
12 https://github.com/apache/arrow-rs
13 
14 https://thrift.apache.org/docs/types
15 |#
16 
17 #|
18  4-byte magic number "PAR1"
19  <Column 1 Chunk 1>
20  <Column 2 Chunk 1>
21  ...
22  <Column N Chunk 1>
23  <Column 1 Chunk 2>
24  <Column 2 Chunk 2>
25  ...
26  <Column N Chunk 2>
27  ...
28  <Column 1 Chunk M>
29  <Column 2 Chunk M>
30  ...
31  <Column N Chunk M>
32  File Metadata
33  4-byte length in bytes of file metadata (little endian)
34  4-byte magic number "PAR1"
35 |#
36 
37 ;; In this package we're being as lazy as possible. To generate our own
38 ;; encoder/decoder methods we depend on the file parquet.thrift in the
39 ;; parquet-format repo above. The core skelfile includes a script to download
40 ;; it and convert it to parquet.json (requires the thrift cli tool). We then
41 ;; decode it with DAT/JSON and generate Lisp classes and types.
42 
43 ;;
44 ;;; Code:
45 (in-package :dat/parquet)
46 (eval-always
47  (dat/parquet/gen::load-parquet))
48 
(defgeneric parquet-read (value &optional stream)
  (:documentation
   "Generic reader entry point for parquet data; methods dispatch on VALUE.
No methods are defined in this section of the file.
NOTE(review): PARQUET-DECODE calls this with an input stream as VALUE and
no STREAM argument -- confirm the intended role of each parameter."))
(defgeneric parquet-write (value &optional stream)
  (:documentation
   "Generic writer entry point for parquet data; methods dispatch on VALUE
and emit its encoding to STREAM. PARQUET-ENCODE forwards to this function."))
51 
;; HACK 2024-07-12: the compression codecs are declared as a bitfield with
;; one boolean slot per codec. The slot order follows the CompressionCodec
;; enum in parquet.thrift (UNCOMPRESSED = 0 through LZ4_RAW = 7).
;; NOTE(review): the thrift definition is an enum (a single codec value),
;; whereas a bitfield of booleans can have several slots set at once --
;; presumably a placeholder until the generated types are used; confirm.
(define-bitfield parquet-compression-codec
  (uncompressed boolean)
  (snappy boolean)
  (gzip boolean)
  (lzo boolean)
  (brotli boolean)
  (lz4 boolean)
  (zstd boolean)
  (lz4-raw boolean))
62 
63 ;;; Read/Write
;; Magic string emitted by PARQUET-WRITE-MAGIC and expected by
;; PARQUET-READ-MAGIC. Per the file layout sketched in the commentary, it
;; appears both at the very start and at the very end of a parquet file.
;; NOTE(review): :test 'equal is presumably there so that re-evaluating the
;; form with an EQUAL (but not EQL) string is not treated as a redefinition
;; -- confirm against the DEFINE-CONSTANT macro in use.
(define-constant +parquet-magic-number+ "PAR1" :test 'equal)
65 
(defconstant +default-parquet-page-size+ (* 8 1024)
  "Default parquet page size in bytes: 8 KiB.")

(defconstant +default-parquet-row-group-size+ (expt 1024 3)
  "Default parquet row group size in bytes: 1 GiB.")

;; The constant was originally spelled without its closing `+`, unlike the
;; other constants in this file. The misspelled name is kept as an alias so
;; that existing references keep working; new code should use
;; +DEFAULT-PARQUET-ROW-GROUP-SIZE+.
(defconstant +default-parquet-row-group-size (expt 1024 3)
  "Deprecated alias of +DEFAULT-PARQUET-ROW-GROUP-SIZE+ (1 GiB, in bytes).")
68 
(defvar *parquet-creator* "parquet-cl version 0.1.0"
  "Name and version string identifying this library.
NOTE(review): presumably meant for the `created_by' field of the parquet
file metadata; it is not referenced in this section of the file -- confirm.")
70 
(defun parquet-write-magic (stream)
  "Write the parquet magic string +PARQUET-MAGIC-NUMBER+ (\"PAR1\") to STREAM.

The string is emitted with WRITE-STRING, so STREAM must accept characters.
Returns the magic string, which is WRITE-STRING's return value."
  (write-string +parquet-magic-number+ stream))
73 
(defun parquet-read-magic (stream)
  "Consume the parquet magic string +PARQUET-MAGIC-NUMBER+ (\"PAR1\") from STREAM.

Characters are read one at a time with READ-CHAR, so STREAM must be a
character input stream. Reading stops at the first character that does not
match.

Returns T when every character of the magic string matched.
Signals an ERROR naming the expected character, its offset and the
character actually read on a mismatch. READ-CHAR's END-OF-FILE error
propagates unchanged if STREAM runs out before the magic is complete."
  ;; Iterate over the constant at run time instead of splicing each
  ;; character in with `#.': no read-time dependency on the constant and no
  ;; hand-unrolled copies to keep in sync with its length.
  (loop for expected across +parquet-magic-number+
        for offset from 0
        for actual = (read-char stream)
        unless (char= expected actual)
          do (error "Invalid parquet magic number: expected ~S at offset ~D but read ~S."
                    expected offset actual))
  t)
80 
(defmethod parquet-write ((value (eql t)) &optional stream)
  "Write the boolean true value to STREAM as the single byte 1.

STREAM is optional in the lambda list, but WRITE-BYTE needs an actual binary
output stream, so callers are expected to supply one. Returns 1."
  ;; No IGNORE declaration is required for VALUE: DEFMETHOD counts every
  ;; specialized parameter as referenced.
  (write-byte 1 stream))
85 
(defmethod parquet-write ((value (eql nil)) &optional stream)
  "Write the boolean false value to STREAM as the single byte 0.

STREAM is optional in the lambda list, but WRITE-BYTE needs an actual binary
output stream, so callers are expected to supply one. Returns 0."
  ;; No IGNORE declaration is required for VALUE: DEFMETHOD counts every
  ;; specialized parameter as referenced.
  (write-byte 0 stream))
90 
(defmethod parquet-write ((value string) &optional stream)
  "Placeholder method for strings: writes nothing to STREAM and returns NIL.
TODO: string encoding is not implemented yet."
  (declare (ignore stream))
  nil)
92 
93 ;;; Encode/Decode
(defun parquet-encode (value &optional stream)
  "Encode the Lisp object VALUE onto the parquet output STREAM.

Thin convenience wrapper: forwards VALUE and STREAM unchanged to the generic
function PARQUET-WRITE and returns whatever the selected method returns."
  (parquet-write value stream))
97 
(defun parquet-decode (string &key (start 0) end)
  "Decode one object from the parquet data held in STRING.

Only the part of STRING between START (default 0) and END (default NIL,
meaning the end of STRING) is made available to the reader.

Returns two values: the primary value of PARQUET-READ applied to a string
input stream over that region, and the FILE-POSITION of that stream once
reading has finished.
NOTE(review): whether FILE-POSITION on a string stream counts from START or
from the beginning of STRING is implementation-dependent -- confirm before
feeding the second value back in as :START."
  (with-input-from-string (in string :start start :end end)
    (let ((object (parquet-read in)))
      (values object (file-position in)))))