changelog shortlog graph tags branches changeset files file revisions raw help

Mercurial > core / annotate lisp/std/num/leb128.lisp

changeset 698: 96958d3eb5b0
parent: 8b10eabe89dd
author: Richard Westhaver <ellis@rwest.io>
date: Fri, 04 Oct 2024 22:04:59 -0400
permissions: -rw-r--r--
description: fixes
544
ec1d4d544c36 parquet expansion, init leb128, add little-endian octet encoders
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
1
 ;;; leb128.lisp --- Little-Endian Base 128 Variable Encoding
ec1d4d544c36 parquet expansion, init leb128, add little-endian octet encoders
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
2
 
547
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
3
 ;; (U)LEB128 encoders based on CL-LEB128
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
4
 
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
5
 ;; see https://github.com/mahirvaluj/cl-leb128/blob/main/leb128.lisp
544
ec1d4d544c36 parquet expansion, init leb128, add little-endian octet encoders
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
6
 
ec1d4d544c36 parquet expansion, init leb128, add little-endian octet encoders
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
7
 ;;; Commentary:
ec1d4d544c36 parquet expansion, init leb128, add little-endian octet encoders
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
8
 
ec1d4d544c36 parquet expansion, init leb128, add little-endian octet encoders
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
9
 ;; ref: https://en.wikipedia.org/wiki/LEB128
ec1d4d544c36 parquet expansion, init leb128, add little-endian octet encoders
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
10
 ;; opt: https://arxiv.org/abs/1503.07387 VByte
ec1d4d544c36 parquet expansion, init leb128, add little-endian octet encoders
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
11
 ;; opt: https://arxiv.org/pdf/1709.08990 VByte streaming
ec1d4d544c36 parquet expansion, init leb128, add little-endian octet encoders
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
12
 
ec1d4d544c36 parquet expansion, init leb128, add little-endian octet encoders
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
13
 ;;; Code:
ec1d4d544c36 parquet expansion, init leb128, add little-endian octet encoders
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
14
 (in-package :std/num)
546
8200daaf8199 encode/decode leb128
Richard Westhaver <ellis@rwest.io>
parents: 544
diff changeset
15
 
547
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
16
 (defun encode-leb128 (i)
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
17
   "Encode an integer of arbitrary length into a leb128 unsigned-8 buffer"
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
18
   (let ((more t) (curr) (in 0) (int (make-array
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
19
                                      4
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
20
                                      :fill-pointer 0
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
21
                                      :element-type '(unsigned-byte 8)))) ;(neg (< i 0))
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
22
     (declare (fixnum i in))
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
23
     (loop while more do
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
24
       (setf curr (logand i #x7f))
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
25
       (setf i (ash i -7))
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
26
       (if (or (and (= i 0)  (= (logand curr #x40) 0))
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
27
               (and (= i -1) (= (logand curr #x40) 64)))
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
28
           (setf more nil)
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
29
           (setf curr (logior curr #x80)))
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
30
       (vector-push-extend curr int)
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
31
       (incf in))
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
32
     (let ((ret (make-array (length int) :element-type '(unsigned-byte 8) :initial-contents int)))
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
33
       ret)))
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
34
 
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
35
 (defun read-leb128 (s &optional (start 0))
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
36
   "decode signed integer from stream. Returns (values decoded-integer
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
37
 num-bytes-consumed)"
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
38
   (declare (fixnum start))
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
39
   (when (not (= start 0))
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
40
     (loop for i from 0 upto start do (read-byte s)))
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
41
   (let ((result 0) (shift 0) (curr) (counter 0))
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
42
     (declare (fixnum result shift counter))
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
43
     (loop do 
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
44
          (setf curr (read-byte s))
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
45
          (setf result (logior result (the fixnum (ash (logand curr #x7f) shift))))
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
46
          (setf shift (+ 7 shift))
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
47
          (incf counter)
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
48
          (when (= 0 (logand curr #x80))
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
49
            (if (= 64 (logand curr #x40))
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
50
                (return-from read-leb128 (values (logior result (the fixnum (ash (lognot 0) shift))) counter))
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
51
                (return-from read-leb128 (values result counter)))))))
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
52
 
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
53
 (defun decode-leb128 (buf &optional (start 0))
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
54
   "decode signed integer from buffer. Returns (values decoded-integer
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
55
 num-bytes-consumed)"
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
56
   (declare (fixnum start) (vector buf))
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
57
   (let ((result 0) (shift 0) (curr 0) (counter 0))
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
58
     (declare (fixnum result shift counter))
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
59
     (loop do 
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
60
          (setf curr (the (unsigned-byte 8) (aref buf start)))
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
61
          (setf start (+ 1 start))
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
62
          (setf result (logior result (the fixnum (ash (logand curr #x7f) shift))))
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
63
          (setf shift (+ 7 shift))
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
64
          (incf counter)
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
65
          (when (= 0 (logand curr #x80))
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
66
            (if (= 64 (logand curr #x40))
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
67
                (return-from decode-leb128 (values (logior result (the fixnum (ash (lognot 0) shift))) counter))
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
68
                (return-from decode-leb128 (values result counter)))))))
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
69
 
550
4d34907c69eb more work on tcompact/thrift, fixed type info in parquet-struct-objects
Richard Westhaver <ellis@rwest.io>
parents: 548
diff changeset
70
 (declaim (ftype (function (integer &optional (unsigned-byte 8)) (array (unsigned-byte 8))) encode-uleb128))
548
b57066450cfa init tcompact encoders
Richard Westhaver <ellis@rwest.io>
parents: 547
diff changeset
71
 (defun encode-uleb128 (int &optional size)
b57066450cfa init tcompact encoders
Richard Westhaver <ellis@rwest.io>
parents: 547
diff changeset
72
   "Encode an integer INT as a ULEB128 byte array with SIZE (in bytes)."
550
4d34907c69eb more work on tcompact/thrift, fixed type info in parquet-struct-objects
Richard Westhaver <ellis@rwest.io>
parents: 548
diff changeset
73
   (declare (integer int))
546
8200daaf8199 encode/decode leb128
Richard Westhaver <ellis@rwest.io>
parents: 544
diff changeset
74
   (let ((more t) (curr) (in 0) (ret (make-array
548
b57066450cfa init tcompact encoders
Richard Westhaver <ellis@rwest.io>
parents: 547
diff changeset
75
                                      (if size
b57066450cfa init tcompact encoders
Richard Westhaver <ellis@rwest.io>
parents: 547
diff changeset
76
                                          size
b57066450cfa init tcompact encoders
Richard Westhaver <ellis@rwest.io>
parents: 547
diff changeset
77
                                          (if (zerop int)
b57066450cfa init tcompact encoders
Richard Westhaver <ellis@rwest.io>
parents: 547
diff changeset
78
                                              1
b57066450cfa init tcompact encoders
Richard Westhaver <ellis@rwest.io>
parents: 547
diff changeset
79
                                              (ceiling  (/ (log (+ int 1) 2) 7))))
546
8200daaf8199 encode/decode leb128
Richard Westhaver <ellis@rwest.io>
parents: 544
diff changeset
80
                                      :element-type '(unsigned-byte 8)))) ;(neg (< int 0))
8200daaf8199 encode/decode leb128
Richard Westhaver <ellis@rwest.io>
parents: 544
diff changeset
81
     (loop while more do
8200daaf8199 encode/decode leb128
Richard Westhaver <ellis@rwest.io>
parents: 544
diff changeset
82
          (setf curr (logand int #x7f))
8200daaf8199 encode/decode leb128
Richard Westhaver <ellis@rwest.io>
parents: 544
diff changeset
83
          (setf int (ash int -7))
8200daaf8199 encode/decode leb128
Richard Westhaver <ellis@rwest.io>
parents: 544
diff changeset
84
          (if (= int 0)
8200daaf8199 encode/decode leb128
Richard Westhaver <ellis@rwest.io>
parents: 544
diff changeset
85
              (setf more nil)
8200daaf8199 encode/decode leb128
Richard Westhaver <ellis@rwest.io>
parents: 544
diff changeset
86
              (setf curr (logior curr #x80)))
8200daaf8199 encode/decode leb128
Richard Westhaver <ellis@rwest.io>
parents: 544
diff changeset
87
          (setf (aref ret in) curr)
8200daaf8199 encode/decode leb128
Richard Westhaver <ellis@rwest.io>
parents: 544
diff changeset
88
          (incf in))
548
b57066450cfa init tcompact encoders
Richard Westhaver <ellis@rwest.io>
parents: 547
diff changeset
89
     ret))
546
8200daaf8199 encode/decode leb128
Richard Westhaver <ellis@rwest.io>
parents: 544
diff changeset
90
 
563
8b10eabe89dd std/tests, clap tweaks
Richard Westhaver <ellis@rwest.io>
parents: 550
diff changeset
91
 (declaim (ftype (function (vector &optional t) integer) decode-uleb128))
547
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
92
 (defun decode-uleb128 (bits &optional (start 0))
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
93
   "Decode an unsigned integer from ULEB128 byte array."
546
8200daaf8199 encode/decode leb128
Richard Westhaver <ellis@rwest.io>
parents: 544
diff changeset
94
   (let ((result 0) (shift 0) (curr) (counter 0))
550
4d34907c69eb more work on tcompact/thrift, fixed type info in parquet-struct-objects
Richard Westhaver <ellis@rwest.io>
parents: 548
diff changeset
95
     (declare (fixnum shift counter))
546
8200daaf8199 encode/decode leb128
Richard Westhaver <ellis@rwest.io>
parents: 544
diff changeset
96
     (loop do 
547
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
97
          (setf curr (aref bits start))
546
8200daaf8199 encode/decode leb128
Richard Westhaver <ellis@rwest.io>
parents: 544
diff changeset
98
          (setf start (+ 1 start))
550
4d34907c69eb more work on tcompact/thrift, fixed type info in parquet-struct-objects
Richard Westhaver <ellis@rwest.io>
parents: 548
diff changeset
99
          (setf result (logior result (ash (logand curr #x7f) shift)))
546
8200daaf8199 encode/decode leb128
Richard Westhaver <ellis@rwest.io>
parents: 544
diff changeset
100
          (setf shift (+ 7 shift))
8200daaf8199 encode/decode leb128
Richard Westhaver <ellis@rwest.io>
parents: 544
diff changeset
101
          (incf counter)
8200daaf8199 encode/decode leb128
Richard Westhaver <ellis@rwest.io>
parents: 544
diff changeset
102
          (when (= 0 (logand curr #x80))
547
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
103
            (return-from decode-uleb128 (values result counter))))))
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
104
 
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
105
 (defun read-uleb128 (s &optional (start 0))
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
106
   "Decode an arbitrarily large unsigned integer from stream. Skip
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
107
 START number bytes. Return (values integer-decoded
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
108
 num-bytes-consumed)"
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
109
   (declare (fixnum start))
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
110
   (when (not (= start 0))
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
111
     (loop for i from 0 upto start do (read-byte s)))
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
112
   (let ((result 0) (shift 0) (curr) (counter 0))
550
4d34907c69eb more work on tcompact/thrift, fixed type info in parquet-struct-objects
Richard Westhaver <ellis@rwest.io>
parents: 548
diff changeset
113
     (declare (fixnum shift counter))
547
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
114
     (loop do 
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
115
          (setf curr (read-byte s))
550
4d34907c69eb more work on tcompact/thrift, fixed type info in parquet-struct-objects
Richard Westhaver <ellis@rwest.io>
parents: 548
diff changeset
116
          (setf result (logior result (ash (logand curr #x7f) shift)))
547
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
117
          (setf shift (+ 7 shift))
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
118
          (incf counter)
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
119
          (when (= 0 (logand curr #x80))
ac01164b4141 leb128 tests
Richard Westhaver <ellis@rwest.io>
parents: 546
diff changeset
120
            (return-from read-uleb128 (values result counter))))))