From 919091615b1093c2096129646b21d9103193a30b Mon Sep 17 00:00:00 2001 From: Jack O'Connor Date: Sun, 18 Aug 2024 16:03:29 -0700 Subject: delete the guts crate for now I haven't developed this experimental code any further since I pushed it, and it's not good for unused code to sit around. If I ever get back to this project I might revert this deletion. --- .github/workflows/ci.yml | 11 - rust/guts/Cargo.toml | 18 - rust/guts/readme.md | 80 ---- rust/guts/src/lib.rs | 1000 --------------------------------------------- rust/guts/src/portable.rs | 262 ------------ rust/guts/src/test.rs | 523 ------------------------ 6 files changed, 1894 deletions(-) delete mode 100644 rust/guts/Cargo.toml delete mode 100644 rust/guts/readme.md delete mode 100644 rust/guts/src/lib.rs delete mode 100644 rust/guts/src/portable.rs delete mode 100644 rust/guts/src/test.rs diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b031602..e0dcd33 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -151,17 +151,6 @@ jobs: run: cargo test working-directory: ./reference_impl - # the new guts crate - - name: guts test - run: cargo test --all-features - working-directory: ./rust/guts - - name: guts no_std build - run: cargo build --no-default-features - working-directory: ./rust/guts - - name: guts no_std test # note that rust/guts/src/test.rs still uses libstd - run: cargo test --no-default-features - working-directory: ./rust/guts - b3sum_tests: name: b3sum ${{ matrix.target.name }} ${{ matrix.channel }} runs-on: ${{ matrix.target.os }} diff --git a/rust/guts/Cargo.toml b/rust/guts/Cargo.toml deleted file mode 100644 index ebcf77f..0000000 --- a/rust/guts/Cargo.toml +++ /dev/null @@ -1,18 +0,0 @@ -[package] -name = "blake3_guts" -version = "0.0.0" -authors = ["Jack O'Connor ", "Samuel Neves"] -description = "low-level building blocks for the BLAKE3 hash function" -repository = "https://github.com/BLAKE3-team/BLAKE3" -license = "CC0-1.0 OR Apache-2.0" -documentation = "https://docs.rs/blake3_guts" -readme = "readme.md" -edition = "2021" - -[dev-dependencies] -hex = "0.4.3" -reference_impl = { path = "../../reference_impl" } - -[features] -default = ["std"] -std = [] diff --git a/rust/guts/readme.md b/rust/guts/readme.md deleted file mode 100644 index 4957816..0000000 --- a/rust/guts/readme.md +++ /dev/null @@ -1,80 +0,0 @@ -# The BLAKE3 Guts API - -## Introduction - -This [`blake3_guts`](https://crates.io/crates/blake3_guts) sub-crate contains -low-level, high-performance, platform-specific implementations of the BLAKE3 -compression function. This API is complicated and unsafe, and this crate will -never have a stable release. Most callers should instead use the -[`blake3`](https://crates.io/crates/blake3) crate, which will eventually depend -on this one internally. - -The code you see here (as of January 2024) is an early stage of a large planned -refactor. The motivation for this refactor is a couple of missing features in -both the Rust and C implementations: - -- The output side - ([`OutputReader`](https://docs.rs/blake3/latest/blake3/struct.OutputReader.html) - in Rust) doesn't take advantage of the most important SIMD optimizations that - compute multiple blocks in parallel. This blocks any project that wants to - use the BLAKE3 XOF as a stream cipher - ([[1]](https://github.com/oconnor663/bessie), - [[2]](https://github.com/oconnor663/blake3_aead)). -- Low-level callers like [Bao](https://github.com/oconnor663/bao) that need - interior nodes of the tree also don't get those SIMD optimizations. They have - to use a slow, minimalistic, unstable, doc-hidden module [(also called - `guts`)](https://github.com/BLAKE3-team/BLAKE3/blob/master/src/guts.rs). - -The difficulty with adding those features is that they require changes to all -of our optimized assembly and C intrinsics code. That's a couple dozen -different files that are large, platform-specific, difficult to understand, and -full of duplicated code. The higher-level Rust and C implementations of BLAKE3 -both depend on these files and will need to coordinate changes. - -At the same time, it won't be long before we add support for more platforms: - -- RISCV vector extensions -- ARM SVE -- WebAssembly SIMD - -It's important to get this refactor done before new platforms make it even -harder to do. - -## The private guts API - -This is the API that each platform reimplements, so we want it to be as simple -as possible apart from the high-performance work it needs to do. It's -completely `unsafe`, and inputs and outputs are raw pointers that are allowed -to alias (this matters for `hash_parents`, see below). - -- `degree` -- `compress` - - The single compression function, for short inputs and odd-length tails. -- `hash_chunks` -- `hash_parents` -- `xof` -- `xof_xor` - - As `xof` but XOR'ing the result into the output buffer. -- `universal_hash` - - This is a new construction specifically to support - [BLAKE3-AEAD](https://github.com/oconnor663/blake3_aead). Some - implementations might just stub it out with portable code. - -## The public guts API - -This is the API that this crate exposes to callers, i.e. to the main `blake3` -crate. It's a thin, portable layer on top of the private API above. The Rust -version of this API is memory-safe. - -- `degree` -- `compress` -- `hash_chunks` -- `hash_parents` - - This handles most levels of the tree, where we keep hashing SIMD_DEGREE - parents at a time. -- `reduce_parents` - - This uses the same `hash_parents` private API, but it handles the top - levels of the tree where we reduce in-place to the root parent node. -- `xof` -- `xof_xor` -- `universal_hash` diff --git a/rust/guts/src/lib.rs b/rust/guts/src/lib.rs deleted file mode 100644 index e9b4914..0000000 --- a/rust/guts/src/lib.rs +++ /dev/null @@ -1,1000 +0,0 @@ -//! # The BLAKE3 Guts API -//! -//! See `readme.md`. -//! -//! The main entrypoint into this crate is [`DETECTED_IMPL`], which is a global [`Implementation`] -//! that atomically initializes itself the first time you use it. -//! -//! # Example -//! -//! ```rust -//! use blake3_guts::{TransposedVectors, DETECTED_IMPL, IV_BYTES, PARENT, ROOT}; -//! -//! // Hash an input of exactly two chunks. -//! let input = [0u8; 2048]; -//! let mut outputs = TransposedVectors::new(); -//! let (left_outputs, _) = DETECTED_IMPL.split_transposed_vectors(&mut outputs); -//! DETECTED_IMPL.hash_chunks( -//! &input, -//! &IV_BYTES, -//! 0, // counter -//! 0, // flags -//! left_outputs, -//! ); -//! let root_node = outputs.extract_parent_node(0); -//! let hash = DETECTED_IMPL.compress( -//! &root_node, -//! 64, // block_len -//! &IV_BYTES, -//! 0, // counter -//! PARENT | ROOT, -//! ); -//! -//! // Compute the same hash using the reference implementation. -//! let mut reference_hasher = reference_impl::Hasher::new(); -//! reference_hasher.update(&input); -//! let mut expected_hash = [0u8; 32]; -//! reference_hasher.finalize(&mut expected_hash); -//! -//! assert_eq!(hash, expected_hash); -//! ``` - -// Tests always require libstd. -#![cfg_attr(all(not(feature = "std"), not(test)), no_std)] - -use core::cmp; -use core::marker::PhantomData; -use core::mem; -use core::ptr; -use core::sync::atomic::{AtomicPtr, Ordering::Relaxed}; - -pub mod portable; - -#[cfg(test)] -mod test; - -pub const OUT_LEN: usize = 32; -pub const BLOCK_LEN: usize = 64; -pub const CHUNK_LEN: usize = 1024; -pub const WORD_LEN: usize = 4; -pub const UNIVERSAL_HASH_LEN: usize = 16; - -pub const CHUNK_START: u32 = 1 << 0; -pub const CHUNK_END: u32 = 1 << 1; -pub const PARENT: u32 = 1 << 2; -pub const ROOT: u32 = 1 << 3; -pub const KEYED_HASH: u32 = 1 << 4; -pub const DERIVE_KEY_CONTEXT: u32 = 1 << 5; -pub const DERIVE_KEY_MATERIAL: u32 = 1 << 6; - -pub const IV: CVWords = [ - 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19, -]; -pub const IV_BYTES: CVBytes = le_bytes_from_words_32(&IV); - -pub const MSG_SCHEDULE: [[usize; 16]; 7] = [ - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], - [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8], - [3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1], - [10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6], - [12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4], - [9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7], - [11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13], -]; - -// never less than 2 -pub const MAX_SIMD_DEGREE: usize = 2; - -pub type CVBytes = [u8; 32]; -pub type CVWords = [u32; 8]; -pub type BlockBytes = [u8; 64]; -pub type BlockWords = [u32; 16]; - -pub static DETECTED_IMPL: Implementation = Implementation::new( - degree_init, - compress_init, - hash_chunks_init, - hash_parents_init, - xof_init, - xof_xor_init, - universal_hash_init, -); - -fn detect() -> Implementation { - portable::implementation() -} - -fn init_detected_impl() { - let detected = detect(); - - DETECTED_IMPL - .degree_ptr - .store(detected.degree_ptr.load(Relaxed), Relaxed); - DETECTED_IMPL - .compress_ptr - .store(detected.compress_ptr.load(Relaxed), Relaxed); - DETECTED_IMPL - .hash_chunks_ptr - .store(detected.hash_chunks_ptr.load(Relaxed), Relaxed); - DETECTED_IMPL - .hash_parents_ptr - .store(detected.hash_parents_ptr.load(Relaxed), Relaxed); - DETECTED_IMPL - .xof_ptr - .store(detected.xof_ptr.load(Relaxed), Relaxed); - DETECTED_IMPL - .xof_xor_ptr - .store(detected.xof_xor_ptr.load(Relaxed), Relaxed); - DETECTED_IMPL - .universal_hash_ptr - .store(detected.universal_hash_ptr.load(Relaxed), Relaxed); -} - -pub struct Implementation { - degree_ptr: AtomicPtr<()>, - compress_ptr: AtomicPtr<()>, - hash_chunks_ptr: AtomicPtr<()>, - hash_parents_ptr: AtomicPtr<()>, - xof_ptr: AtomicPtr<()>, - xof_xor_ptr: AtomicPtr<()>, - universal_hash_ptr: AtomicPtr<()>, -} - -impl Implementation { - const fn new( - degree_fn: DegreeFn, - compress_fn: CompressFn, - hash_chunks_fn: HashChunksFn, - hash_parents_fn: HashParentsFn, - xof_fn: XofFn, - xof_xor_fn: XofFn, - universal_hash_fn: UniversalHashFn, - ) -> Self { - Self { - degree_ptr: AtomicPtr::new(degree_fn as *mut ()), - compress_ptr: AtomicPtr::new(compress_fn as *mut ()), - hash_chunks_ptr: AtomicPtr::new(hash_chunks_fn as *mut ()), - hash_parents_ptr: AtomicPtr::new(hash_parents_fn as *mut ()), - xof_ptr: AtomicPtr::new(xof_fn as *mut ()), - xof_xor_ptr: AtomicPtr::new(xof_xor_fn as *mut ()), - universal_hash_ptr: AtomicPtr::new(universal_hash_fn as *mut ()), - } - } - - #[inline] - fn degree_fn(&self) -> DegreeFn { - unsafe { mem::transmute(self.degree_ptr.load(Relaxed)) } - } - - #[inline] - pub fn degree(&self) -> usize { - let degree = unsafe { self.degree_fn()() }; - debug_assert!(degree >= 2); - debug_assert!(degree <= MAX_SIMD_DEGREE); - debug_assert_eq!(1, degree.count_ones(), "power of 2"); - degree - } - - #[inline] - pub fn split_transposed_vectors<'v>( - &self, - vectors: &'v mut TransposedVectors, - ) -> (TransposedSplit<'v>, TransposedSplit<'v>) { - unsafe { vectors.split(self.degree()) } - } - - #[inline] - fn compress_fn(&self) -> CompressFn { - unsafe { mem::transmute(self.compress_ptr.load(Relaxed)) } - } - - #[inline] - pub fn compress( - &self, - block: &BlockBytes, - block_len: u32, - cv: &CVBytes, - counter: u64, - flags: u32, - ) -> CVBytes { - let mut out = [0u8; 32]; - unsafe { - self.compress_fn()(block, block_len, cv, counter, flags, &mut out); - } - out - } - - // The contract for HashChunksFn doesn't require the implementation to support single-chunk - // inputs. Instead we handle that case here by calling compress in a loop. - #[inline] - fn hash_one_chunk( - &self, - mut input: &[u8], - key: &CVBytes, - counter: u64, - mut flags: u32, - output: TransposedSplit, - ) { - debug_assert!(input.len() <= CHUNK_LEN); - let mut cv = *key; - flags |= CHUNK_START; - while input.len() > BLOCK_LEN { - cv = self.compress( - input[..BLOCK_LEN].try_into().unwrap(), - BLOCK_LEN as u32, - &cv, - counter, - flags, - ); - input = &input[BLOCK_LEN..]; - flags &= !CHUNK_START; - } - let mut final_block = [0u8; BLOCK_LEN]; - final_block[..input.len()].copy_from_slice(input); - cv = self.compress( - &final_block, - input.len() as u32, - &cv, - counter, - flags | CHUNK_END, - ); - unsafe { - write_transposed_cv(&words_from_le_bytes_32(&cv), output.ptr); - } - } - - #[inline] - fn hash_chunks_fn(&self) -> HashChunksFn { - unsafe { mem::transmute(self.hash_chunks_ptr.load(Relaxed)) } - } - - #[inline] - pub fn hash_chunks( - &self, - input: &[u8], - key: &CVBytes, - counter: u64, - flags: u32, - transposed_output: TransposedSplit, - ) -> usize { - debug_assert!(input.len() <= self.degree() * CHUNK_LEN); - if input.len() <= CHUNK_LEN { - // The underlying hash_chunks_fn isn't required to support this case. Instead we handle - // it by calling compress_fn in a loop. But note that we still don't support root - // finalization or the empty input here. - self.hash_one_chunk(input, key, counter, flags, transposed_output); - return 1; - } - // SAFETY: If the caller passes in more than MAX_SIMD_DEGREE * CHUNK_LEN bytes, silently - // ignore the remainder. This makes it impossible to write out of bounds in a properly - // constructed TransposedSplit. - let len = cmp::min(input.len(), MAX_SIMD_DEGREE * CHUNK_LEN); - unsafe { - self.hash_chunks_fn()( - input.as_ptr(), - len, - key, - counter, - flags, - transposed_output.ptr, - ); - } - if input.len() % CHUNK_LEN == 0 { - input.len() / CHUNK_LEN - } else { - (input.len() / CHUNK_LEN) + 1 - } - } - - #[inline] - fn hash_parents_fn(&self) -> HashParentsFn { - unsafe { mem::transmute(self.hash_parents_ptr.load(Relaxed)) } - } - - #[inline] - pub fn hash_parents( - &self, - transposed_input: &TransposedVectors, - mut num_cvs: usize, - key: &CVBytes, - flags: u32, - transposed_output: TransposedSplit, - ) -> usize { - debug_assert!(num_cvs <= 2 * MAX_SIMD_DEGREE); - // SAFETY: Cap num_cvs at 2 * MAX_SIMD_DEGREE, to guarantee no out-of-bounds accesses. - num_cvs = cmp::min(num_cvs, 2 * MAX_SIMD_DEGREE); - let mut odd_cv = [0u32; 8]; - if num_cvs % 2 == 1 { - unsafe { - odd_cv = read_transposed_cv(transposed_input.as_ptr().add(num_cvs - 1)); - } - } - let num_parents = num_cvs / 2; - unsafe { - self.hash_parents_fn()( - transposed_input.as_ptr(), - num_parents, - key, - flags | PARENT, - transposed_output.ptr, - ); - } - if num_cvs % 2 == 1 { - unsafe { - write_transposed_cv(&odd_cv, transposed_output.ptr.add(num_parents)); - } - num_parents + 1 - } else { - num_parents - } - } - - #[inline] - pub fn reduce_parents( - &self, - transposed_in_out: &mut TransposedVectors, - mut num_cvs: usize, - key: &CVBytes, - flags: u32, - ) -> usize { - debug_assert!(num_cvs <= 2 * MAX_SIMD_DEGREE); - // SAFETY: Cap num_cvs at 2 * MAX_SIMD_DEGREE, to guarantee no out-of-bounds accesses. - num_cvs = cmp::min(num_cvs, 2 * MAX_SIMD_DEGREE); - let in_out_ptr = transposed_in_out.as_mut_ptr(); - let mut odd_cv = [0u32; 8]; - if num_cvs % 2 == 1 { - unsafe { - odd_cv = read_transposed_cv(in_out_ptr.add(num_cvs - 1)); - } - } - let num_parents = num_cvs / 2; - unsafe { - self.hash_parents_fn()(in_out_ptr, num_parents, key, flags | PARENT, in_out_ptr); - } - if num_cvs % 2 == 1 { - unsafe { - write_transposed_cv(&odd_cv, in_out_ptr.add(num_parents)); - } - num_parents + 1 - } else { - num_parents - } - } - - #[inline] - fn xof_fn(&self) -> XofFn { - unsafe { mem::transmute(self.xof_ptr.load(Relaxed)) } - } - - #[inline] - pub fn xof( - &self, - block: &BlockBytes, - block_len: u32, - cv: &CVBytes, - mut counter: u64, - flags: u32, - mut out: &mut [u8], - ) { - let degree = self.degree(); - let simd_len = degree * BLOCK_LEN; - while !out.is_empty() { - let take = cmp::min(simd_len, out.len()); - unsafe { - self.xof_fn()( - block, - block_len, - cv, - counter, - flags | ROOT, - out.as_mut_ptr(), - take, - ); - } - out = &mut out[take..]; - counter += degree as u64; - } - } - - #[inline] - fn xof_xor_fn(&self) -> XofFn { - unsafe { mem::transmute(self.xof_xor_ptr.load(Relaxed)) } - } - - #[inline] - pub fn xof_xor( - &self, - block: &BlockBytes, - block_len: u32, - cv: &CVBytes, - mut counter: u64, - flags: u32, - mut out: &mut [u8], - ) { - let degree = self.degree(); - let simd_len = degree * BLOCK_LEN; - while !out.is_empty() { - let take = cmp::min(simd_len, out.len()); - unsafe { - self.xof_xor_fn()( - block, - block_len, - cv, - counter, - flags | ROOT, - out.as_mut_ptr(), - take, - ); - } - out = &mut out[take..]; - counter += degree as u64; - } - } - - #[inline] - fn universal_hash_fn(&self) -> UniversalHashFn { - unsafe { mem::transmute(self.universal_hash_ptr.load(Relaxed)) } - } - - #[inline] - pub fn universal_hash(&self, mut input: &[u8], key: &CVBytes, mut counter: u64) -> [u8; 16] { - let degree = self.degree(); - let simd_len = degree * BLOCK_LEN; - let mut ret = [0u8; 16]; - while !input.is_empty() { - let take = cmp::min(simd_len, input.len()); - let mut output = [0u8; 16]; - unsafe { - self.universal_hash_fn()(input.as_ptr(), take, key, counter, &mut output); - } - input = &input[take..]; - counter += degree as u64; - for byte_index in 0..16 { - ret[byte_index] ^= output[byte_index]; - } - } - ret - } -} - -impl Clone for Implementation { - fn clone(&self) -> Self { - Self { - degree_ptr: AtomicPtr::new(self.degree_ptr.load(Relaxed)), - compress_ptr: AtomicPtr::new(self.compress_ptr.load(Relaxed)), - hash_chunks_ptr: AtomicPtr::new(self.hash_chunks_ptr.load(Relaxed)), - hash_parents_ptr: AtomicPtr::new(self.hash_parents_ptr.load(Relaxed)), - xof_ptr: AtomicPtr::new(self.xof_ptr.load(Relaxed)), - xof_xor_ptr: AtomicPtr::new(self.xof_xor_ptr.load(Relaxed)), - universal_hash_ptr: AtomicPtr::new(self.universal_hash_ptr.load(Relaxed)), - } - } -} - -// never less than 2 -type DegreeFn = unsafe extern "C" fn() -> usize; - -unsafe extern "C" fn degree_init() -> usize { - init_detected_impl(); - DETECTED_IMPL.degree_fn()() -} - -type CompressFn = unsafe extern "C" fn( - block: *const BlockBytes, // zero padded to 64 bytes - block_len: u32, - cv: *const CVBytes, - counter: u64, - flags: u32, - out: *mut CVBytes, // may overlap the input -); - -unsafe extern "C" fn compress_init( - block: *const BlockBytes, - block_len: u32, - cv: *const CVBytes, - counter: u64, - flags: u32, - out: *mut CVBytes, -) { - init_detected_impl(); - DETECTED_IMPL.compress_fn()(block, block_len, cv, counter, flags, out); -} - -type CompressXofFn = unsafe extern "C" fn( - block: *const BlockBytes, // zero padded to 64 bytes - block_len: u32, - cv: *const CVBytes, - counter: u64, - flags: u32, - out: *mut BlockBytes, // may overlap the input -); - -type HashChunksFn = unsafe extern "C" fn( - input: *const u8, - input_len: usize, - key: *const CVBytes, - counter: u64, - flags: u32, - transposed_output: *mut u32, -); - -unsafe extern "C" fn hash_chunks_init( - input: *const u8, - input_len: usize, - key: *const CVBytes, - counter: u64, - flags: u32, - transposed_output: *mut u32, -) { - init_detected_impl(); - DETECTED_IMPL.hash_chunks_fn()(input, input_len, key, counter, flags, transposed_output); -} - -type HashParentsFn = unsafe extern "C" fn( - transposed_input: *const u32, - num_parents: usize, - key: *const CVBytes, - flags: u32, - transposed_output: *mut u32, // may overlap the input -); - -unsafe extern "C" fn hash_parents_init( - transposed_input: *const u32, - num_parents: usize, - key: *const CVBytes, - flags: u32, - transposed_output: *mut u32, -) { - init_detected_impl(); - DETECTED_IMPL.hash_parents_fn()(transposed_input, num_parents, key, flags, transposed_output); -} - -// This signature covers both xof() and xof_xor(). -type XofFn = unsafe extern "C" fn( - block: *const BlockBytes, // zero padded to 64 bytes - block_len: u32, - cv: *const CVBytes, - counter: u64, - flags: u32, - out: *mut u8, - out_len: usize, -); - -unsafe extern "C" fn xof_init( - block: *const BlockBytes, - block_len: u32, - cv: *const CVBytes, - counter: u64, - flags: u32, - out: *mut u8, - out_len: usize, -) { - init_detected_impl(); - DETECTED_IMPL.xof_fn()(block, block_len, cv, counter, flags, out, out_len); -} - -unsafe extern "C" fn xof_xor_init( - block: *const BlockBytes, - block_len: u32, - cv: *const CVBytes, - counter: u64, - flags: u32, - out: *mut u8, - out_len: usize, -) { - init_detected_impl(); - DETECTED_IMPL.xof_xor_fn()(block, block_len, cv, counter, flags, out, out_len); -} - -type UniversalHashFn = unsafe extern "C" fn( - input: *const u8, - input_len: usize, - key: *const CVBytes, - counter: u64, - out: *mut [u8; 16], -); - -unsafe extern "C" fn universal_hash_init( - input: *const u8, - input_len: usize, - key: *const CVBytes, - counter: u64, - out: *mut [u8; 16], -) { - init_detected_impl(); - DETECTED_IMPL.universal_hash_fn()(input, input_len, key, counter, out); -} - -// The implicit degree of this implementation is MAX_SIMD_DEGREE. -#[inline(always)] -unsafe fn hash_chunks_using_compress( - compress: CompressFn, - mut input: *const u8, - mut input_len: usize, - key: *const CVBytes, - mut counter: u64, - flags: u32, - mut transposed_output: *mut u32, -) { - debug_assert!(input_len > 0); - debug_assert!(input_len <= MAX_SIMD_DEGREE * CHUNK_LEN); - input_len = cmp::min(input_len, MAX_SIMD_DEGREE * CHUNK_LEN); - while input_len > 0 { - let mut chunk_len = cmp::min(input_len, CHUNK_LEN); - input_len -= chunk_len; - // We only use 8 words of the CV, but compress returns 16. - let mut cv = *key; - let cv_ptr: *mut CVBytes = &mut cv; - let mut chunk_flags = flags | CHUNK_START; - while chunk_len > BLOCK_LEN { - compress( - input as *const BlockBytes, - BLOCK_LEN as u32, - cv_ptr, - counter, - chunk_flags, - cv_ptr, - ); - input = input.add(BLOCK_LEN); - chunk_len -= BLOCK_LEN; - chunk_flags &= !CHUNK_START; - } - let mut last_block = [0u8; BLOCK_LEN]; - ptr::copy_nonoverlapping(input, last_block.as_mut_ptr(), chunk_len); - input = input.add(chunk_len); - compress( - &last_block, - chunk_len as u32, - cv_ptr, - counter, - chunk_flags | CHUNK_END, - cv_ptr, - ); - let cv_words = words_from_le_bytes_32(&cv); - for word_index in 0..8 { - transposed_output - .add(word_index * TRANSPOSED_STRIDE) - .write(cv_words[word_index]); - } - transposed_output = transposed_output.add(1); - counter += 1; - } -} - -// The implicit degree of this implementation is MAX_SIMD_DEGREE. -#[inline(always)] -unsafe fn hash_parents_using_compress( - compress: CompressFn, - mut transposed_input: *const u32, - mut num_parents: usize, - key: *const CVBytes, - flags: u32, - mut transposed_output: *mut u32, // may overlap the input -) { - debug_assert!(num_parents > 0); - debug_assert!(num_parents <= MAX_SIMD_DEGREE); - while num_parents > 0 { - let mut block_bytes = [0u8; 64]; - for word_index in 0..8 { - let left_child_word = transposed_input.add(word_index * TRANSPOSED_STRIDE).read(); - block_bytes[WORD_LEN * word_index..][..WORD_LEN] - .copy_from_slice(&left_child_word.to_le_bytes()); - let right_child_word = transposed_input - .add(word_index * TRANSPOSED_STRIDE + 1) - .read(); - block_bytes[WORD_LEN * (word_index + 8)..][..WORD_LEN] - .copy_from_slice(&right_child_word.to_le_bytes()); - } - let mut cv = [0u8; 32]; - compress(&block_bytes, BLOCK_LEN as u32, key, 0, flags, &mut cv); - let cv_words = words_from_le_bytes_32(&cv); - for word_index in 0..8 { - transposed_output - .add(word_index * TRANSPOSED_STRIDE) - .write(cv_words[word_index]); - } - transposed_input = transposed_input.add(2); - transposed_output = transposed_output.add(1); - num_parents -= 1; - } -} - -#[inline(always)] -unsafe fn xof_using_compress_xof( - compress_xof: CompressXofFn, - block: *const BlockBytes, - block_len: u32, - cv: *const CVBytes, - mut counter: u64, - flags: u32, - mut out: *mut u8, - mut out_len: usize, -) { - debug_assert!(out_len <= MAX_SIMD_DEGREE * BLOCK_LEN); - while out_len > 0 { - let mut block_output = [0u8; 64]; - compress_xof(block, block_len, cv, counter, flags, &mut block_output); - let take = cmp::min(out_len, BLOCK_LEN); - ptr::copy_nonoverlapping(block_output.as_ptr(), out, take); - out = out.add(take); - out_len -= take; - counter += 1; - } -} - -#[inline(always)] -unsafe fn xof_xor_using_compress_xof( - compress_xof: CompressXofFn, - block: *const BlockBytes, - block_len: u32, - cv: *const CVBytes, - mut counter: u64, - flags: u32, - mut out: *mut u8, - mut out_len: usize, -) { - debug_assert!(out_len <= MAX_SIMD_DEGREE * BLOCK_LEN); - while out_len > 0 { - let mut block_output = [0u8; 64]; - compress_xof(block, block_len, cv, counter, flags, &mut block_output); - let take = cmp::min(out_len, BLOCK_LEN); - for i in 0..take { - *out.add(i) ^= block_output[i]; - } - out = out.add(take); - out_len -= take; - counter += 1; - } -} - -#[inline(always)] -unsafe fn universal_hash_using_compress( - compress: CompressFn, - mut input: *const u8, - mut input_len: usize, - key: *const CVBytes, - mut counter: u64, - out: *mut [u8; 16], -) { - let flags = KEYED_HASH | CHUNK_START | CHUNK_END | ROOT; - let mut result = [0u8; 16]; - while input_len > 0 { - let block_len = cmp::min(input_len, BLOCK_LEN); - let mut block = [0u8; BLOCK_LEN]; - ptr::copy_nonoverlapping(input, block.as_mut_ptr(), block_len); - let mut block_output = [0u8; 32]; - compress( - &block, - block_len as u32, - key, - counter, - flags, - &mut block_output, - ); - for i in 0..16 { - result[i] ^= block_output[i]; - } - input = input.add(block_len); - input_len -= block_len; - counter += 1; - } - *out = result; -} - -// this is in units of *words*, for pointer operations on *const/*mut u32 -const TRANSPOSED_STRIDE: usize = 2 * MAX_SIMD_DEGREE; - -#[cfg_attr(any(target_arch = "x86", target_arch = "x86_64"), repr(C, align(64)))] -#[derive(Clone, Debug, PartialEq, Eq)] -pub struct TransposedVectors([[u32; 2 * MAX_SIMD_DEGREE]; 8]); - -impl TransposedVectors { - pub fn new() -> Self { - Self([[0; 2 * MAX_SIMD_DEGREE]; 8]) - } - - pub fn extract_cv(&self, cv_index: usize) -> CVBytes { - let mut words = [0u32; 8]; - for word_index in 0..8 { - words[word_index] = self.0[word_index][cv_index]; - } - le_bytes_from_words_32(&words) - } - - pub fn extract_parent_node(&self, parent_index: usize) -> BlockBytes { - let mut bytes = [0u8; 64]; - bytes[..32].copy_from_slice(&self.extract_cv(parent_index / 2)); - bytes[32..].copy_from_slice(&self.extract_cv(parent_index / 2 + 1)); - bytes - } - - fn as_ptr(&self) -> *const u32 { - self.0[0].as_ptr() - } - - fn as_mut_ptr(&mut self) -> *mut u32 { - self.0[0].as_mut_ptr() - } - - // SAFETY: This function is just pointer arithmetic, but callers assume that it's safe (not - // necessarily correct) to write up to `degree` words to either side of the split, possibly - // from different threads. - unsafe fn split(&mut self, degree: usize) -> (TransposedSplit, TransposedSplit) { - debug_assert!(degree > 0); - debug_assert!(degree <= MAX_SIMD_DEGREE); - debug_assert_eq!(degree.count_ones(), 1, "power of 2"); - let ptr = self.as_mut_ptr(); - let left = TransposedSplit { - ptr, - phantom_data: PhantomData, - }; - let right = TransposedSplit { - ptr: ptr.wrapping_add(degree), - phantom_data: PhantomData, - }; - (left, right) - } -} - -pub struct TransposedSplit<'vectors> { - ptr: *mut u32, - phantom_data: PhantomData<&'vectors mut u32>, -} - -unsafe impl<'vectors> Send for TransposedSplit<'vectors> {} -unsafe impl<'vectors> Sync for TransposedSplit<'vectors> {} - -unsafe fn read_transposed_cv(src: *const u32) -> CVWords { - let mut cv = [0u32; 8]; - for word_index in 0..8 { - let offset_words = word_index * TRANSPOSED_STRIDE; - cv[word_index] = src.add(offset_words).read(); - } - cv -} - -unsafe fn write_transposed_cv(cv: &CVWords, dest: *mut u32) { - for word_index in 0..8 { - let offset_words = word_index * TRANSPOSED_STRIDE; - dest.add(offset_words).write(cv[word_index]); - } -} - -#[inline(always)] -pub const fn le_bytes_from_words_32(words: &CVWords) -> CVBytes { - let mut bytes = [0u8; 32]; - // This loop is super verbose because currently that's what it takes to be const. - let mut word_index = 0; - while word_index < bytes.len() / WORD_LEN { - let word_bytes = words[word_index].to_le_bytes(); - let mut byte_index = 0; - while byte_index < WORD_LEN { - bytes[word_index * WORD_LEN + byte_index] = word_bytes[byte_index]; - byte_index += 1; - } - word_index += 1; - } - bytes -} - -#[inline(always)] -pub const fn le_bytes_from_words_64(words: &BlockWords) -> BlockBytes { - let mut bytes = [0u8; 64]; - // This loop is super verbose because currently that's what it takes to be const. - let mut word_index = 0; - while word_index < bytes.len() / WORD_LEN { - let word_bytes = words[word_index].to_le_bytes(); - let mut byte_index = 0; - while byte_index < WORD_LEN { - bytes[word_index * WORD_LEN + byte_index] = word_bytes[byte_index]; - byte_index += 1; - } - word_index += 1; - } - bytes -} - -#[inline(always)] -pub const fn words_from_le_bytes_32(bytes: &CVBytes) -> CVWords { - let mut words = [0u32; 8]; - // This loop is super verbose because currently that's what it takes to be const. - let mut word_index = 0; - while word_index < words.len() { - let mut word_bytes = [0u8; WORD_LEN]; - let mut byte_index = 0; - while byte_index < WORD_LEN { - word_bytes[byte_index] = bytes[word_index * WORD_LEN + byte_index]; - byte_index += 1; - } - words[word_index] = u32::from_le_bytes(word_bytes); - word_index += 1; - } - words -} - -#[inline(always)] -pub const fn words_from_le_bytes_64(bytes: &BlockBytes) -> BlockWords { - let mut words = [0u32; 16]; - // This loop is super verbose because currently that's what it takes to be const. - let mut word_index = 0; - while word_index < words.len() { - let mut word_bytes = [0u8; WORD_LEN]; - let mut byte_index = 0; - while byte_index < WORD_LEN { - word_bytes[byte_index] = bytes[word_index * WORD_LEN + byte_index]; - byte_index += 1; - } - words[word_index] = u32::from_le_bytes(word_bytes); - word_index += 1; - } - words -} - -#[test] -fn test_byte_word_round_trips() { - let cv = *b"This is 32 LE bytes/eight words."; - assert_eq!(cv, le_bytes_from_words_32(&words_from_le_bytes_32(&cv))); - let block = *b"This is sixty-four little-endian bytes, or sixteen 32-bit words."; - assert_eq!( - block, - le_bytes_from_words_64(&words_from_le_bytes_64(&block)), - ); -} - -// The largest power of two less than or equal to `n`, used for left_len() -// immediately below, and also directly in Hasher::update(). -pub fn largest_power_of_two_leq(n: usize) -> usize { - ((n / 2) + 1).next_power_of_two() -} - -#[test] -fn test_largest_power_of_two_leq() { - let input_output = &[ - // The zero case is nonsensical, but it does work. - (0, 1), - (1, 1), - (2, 2), - (3, 2), - (4, 4), - (5, 4), - (6, 4), - (7, 4), - (8, 8), - // the largest possible usize - (usize::MAX, (usize::MAX >> 1) + 1), - ]; - for &(input, output) in input_output { - assert_eq!( - output, - crate::largest_power_of_two_leq(input), - "wrong output for n={}", - input - ); - } -} - -// Given some input larger than one chunk, return the number of bytes that -// should go in the left subtree. This is the largest power-of-2 number of -// chunks that leaves at least 1 byte for the right subtree. -pub fn left_len(content_len: usize) -> usize { - debug_assert!(content_len > CHUNK_LEN); - // Subtract 1 to reserve at least one byte for the right side. - let full_chunks = (content_len - 1) / CHUNK_LEN; - largest_power_of_two_leq(full_chunks) * CHUNK_LEN -} - -#[test] -fn test_left_len() { - let input_output = &[ - (CHUNK_LEN + 1, CHUNK_LEN), - (2 * CHUNK_LEN - 1, CHUNK_LEN), - (2 * CHUNK_LEN, CHUNK_LEN), - (2 * CHUNK_LEN + 1, 2 * CHUNK_LEN), - (4 * CHUNK_LEN - 1, 2 * CHUNK_LEN), - (4 * CHUNK_LEN, 2 * CHUNK_LEN), - (4 * CHUNK_LEN + 1, 4 * CHUNK_LEN), - ]; - for &(input, output) in input_output { - assert_eq!(left_len(input), output); - } -} diff --git a/rust/guts/src/portable.rs b/rust/guts/src/portable.rs deleted file mode 100644 index d597644..0000000 --- a/rust/guts/src/portable.rs +++ /dev/null @@ -1,262 +0,0 @@ -use crate::{ - le_bytes_from_words_32, le_bytes_from_words_64, words_from_le_bytes_32, words_from_le_bytes_64, - BlockBytes, BlockWords, CVBytes, CVWords, Implementation, IV, MAX_SIMD_DEGREE, MSG_SCHEDULE, -}; - -const DEGREE: usize = MAX_SIMD_DEGREE; - -unsafe extern "C" fn degree() -> usize { - DEGREE -} - -#[inline(always)] -fn g(state: &mut BlockWords, a: usize, b: usize, c: usize, d: usize, x: u32, y: u32) { - state[a] = state[a].wrapping_add(state[b]).wrapping_add(x); - state[d] = (state[d] ^ state[a]).rotate_right(16); - state[c] = state[c].wrapping_add(state[d]); - state[b] = (state[b] ^ state[c]).rotate_right(12); - state[a] = state[a].wrapping_add(state[b]).wrapping_add(y); - state[d] = (state[d] ^ state[a]).rotate_right(8); - state[c] = state[c].wrapping_add(state[d]); - state[b] = (state[b] ^ state[c]).rotate_right(7); -} - -#[inline(always)] -fn round(state: &mut [u32; 16], msg: &BlockWords, round: usize) { - // Select the message schedule based on the round. - let schedule = MSG_SCHEDULE[round]; - - // Mix the columns. - g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]); - g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]); - g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]); - g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]); - - // Mix the diagonals. - g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]); - g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]); - g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]); - g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]); -} - -#[inline(always)] -fn compress_inner( - block_words: &BlockWords, - block_len: u32, - cv_words: &CVWords, - counter: u64, - flags: u32, -) -> [u32; 16] { - let mut state = [ - cv_words[0], - cv_words[1], - cv_words[2], - cv_words[3], - cv_words[4], - cv_words[5], - cv_words[6], - cv_words[7], - IV[0], - IV[1], - IV[2], - IV[3], - counter as u32, - (counter >> 32) as u32, - block_len as u32, - flags as u32, - ]; - for round_number in 0..7 { - round(&mut state, &block_words, round_number); - } - state -} - -pub(crate) unsafe extern "C" fn compress( - block: *const BlockBytes, - block_len: u32, - cv: *const CVBytes, - counter: u64, - flags: u32, - out: *mut CVBytes, -) { - let block_words = words_from_le_bytes_64(&*block); - let cv_words = words_from_le_bytes_32(&*cv); - let mut state = compress_inner(&block_words, block_len, &cv_words, counter, flags); - for word_index in 0..8 { - state[word_index] ^= state[word_index + 8]; - } - *out = le_bytes_from_words_32(state[..8].try_into().unwrap()); -} - -pub(crate) unsafe extern "C" fn compress_xof( - block: *const BlockBytes, - block_len: u32, - cv: *const CVBytes, - counter: u64, - flags: u32, - out: *mut BlockBytes, -) { - let block_words = words_from_le_bytes_64(&*block); - let cv_words = words_from_le_bytes_32(&*cv); - let mut state = compress_inner(&block_words, block_len, &cv_words, counter, flags); - for word_index in 0..8 { - state[word_index] ^= state[word_index + 8]; - state[word_index + 8] ^= cv_words[word_index]; - } - *out = le_bytes_from_words_64(&state); -} - -pub(crate) unsafe extern "C" fn hash_chunks( - input: *const u8, - input_len: usize, - key: *const CVBytes, - counter: u64, - flags: u32, - transposed_output: *mut u32, -) { - crate::hash_chunks_using_compress( - compress, - input, - input_len, - key, - counter, - flags, - transposed_output, - ) -} - -pub(crate) unsafe extern "C" fn hash_parents( - transposed_input: *const u32, - num_parents: usize, - key: *const CVBytes, - flags: u32, - transposed_output: *mut u32, // may overlap the input -) { - crate::hash_parents_using_compress( - compress, - transposed_input, - num_parents, - key, - flags, - transposed_output, - ) -} - -pub(crate) unsafe extern "C" fn xof( - block: *const BlockBytes, - block_len: u32, - cv: *const CVBytes, - counter: u64, - flags: u32, - out: *mut u8, - out_len: usize, -) { - crate::xof_using_compress_xof( - compress_xof, - block, - block_len, - cv, - counter, - flags, - out, - out_len, - ) -} - -pub(crate) unsafe extern "C" fn xof_xor( - block: *const BlockBytes, - block_len: u32, - cv: *const CVBytes, - counter: u64, - flags: u32, - out: *mut u8, - out_len: usize, -) { - crate::xof_xor_using_compress_xof( - compress_xof, - block, - block_len, - cv, - counter, - flags, - out, - out_len, - ) -} - -pub(crate) unsafe extern "C" fn universal_hash( - input: *const u8, - input_len: usize, - key: *const CVBytes, - counter: u64, - out: *mut [u8; 16], -) { - crate::universal_hash_using_compress(compress, input, input_len, key, counter, out) -} - -pub fn implementation() -> Implementation { - Implementation::new( - degree, - compress, - hash_chunks, - hash_parents, - xof, - xof_xor, - universal_hash, - ) -} - -#[cfg(test)] -mod test { - use super::*; - - // This is circular but do it anyway. - #[test] - fn test_compress_vs_portable() { - crate::test::test_compress_vs_portable(&implementation()); - } - - #[test] - fn test_compress_vs_reference() { - crate::test::test_compress_vs_reference(&implementation()); - } - - // This is circular but do it anyway. - #[test] - fn test_hash_chunks_vs_portable() { - crate::test::test_hash_chunks_vs_portable(&implementation()); - } - - // This is circular but do it anyway. - #[test] - fn test_hash_parents_vs_portable() { - crate::test::test_hash_parents_vs_portable(&implementation()); - } - - #[test] - fn test_chunks_and_parents_vs_reference() { - crate::test::test_chunks_and_parents_vs_reference(&implementation()); - } - - // This is circular but do it anyway. - #[test] - fn test_xof_vs_portable() { - crate::test::test_xof_vs_portable(&implementation()); - } - - #[test] - fn test_xof_vs_reference() { - crate::test::test_xof_vs_reference(&implementation()); - } - - // This is circular but do it anyway. - #[test] - fn test_universal_hash_vs_portable() { - crate::test::test_universal_hash_vs_portable(&implementation()); - } - - #[test] - fn test_universal_hash_vs_reference() { - crate::test::test_universal_hash_vs_reference(&implementation()); - } -} diff --git a/rust/guts/src/test.rs b/rust/guts/src/test.rs deleted file mode 100644 index 83bd790..0000000 --- a/rust/guts/src/test.rs +++ /dev/null @@ -1,523 +0,0 @@ -use crate::*; - -pub const TEST_KEY: CVBytes = *b"whats the Elvish word for friend"; - -// Test a few different initial counter values. -// - 0: The base case. -// - i32::MAX: *No* overflow. But carry bugs in tricky SIMD code can screw this up, if you XOR when -// you're supposed to ANDNOT. -// - u32::MAX: The low word of the counter overflows for all inputs except the first. -// - (42 << 32) + u32::MAX: Same but with a non-zero value in the high word. -const INITIAL_COUNTERS: [u64; 4] = [ - 0, - i32::MAX as u64, - u32::MAX as u64, - (42u64 << 32) + u32::MAX as u64, -]; - -const BLOCK_LENGTHS: [usize; 4] = [0, 1, 63, 64]; - -pub fn paint_test_input(buf: &mut [u8]) { - for (i, b) in buf.iter_mut().enumerate() { - *b = (i % 251) as u8; - } -} - -pub fn test_compress_vs_portable(test_impl: &Implementation) { - for block_len in BLOCK_LENGTHS { - dbg!(block_len); - let mut block = [0; BLOCK_LEN]; - paint_test_input(&mut block[..block_len]); - for counter in INITIAL_COUNTERS { - dbg!(counter); - let portable_cv = portable::implementation().compress( - &block, - block_len as u32, - &TEST_KEY, - counter, - KEYED_HASH, - ); - - let test_cv = - test_impl.compress(&block, block_len as u32, &TEST_KEY, counter, KEYED_HASH); - - assert_eq!(portable_cv, test_cv); - } - } -} - -pub fn test_compress_vs_reference(test_impl: &Implementation) { - for block_len in BLOCK_LENGTHS { - dbg!(block_len); - let mut block = [0; BLOCK_LEN]; - paint_test_input(&mut block[..block_len]); - - let mut ref_hasher = reference_impl::Hasher::new_keyed(&TEST_KEY); - ref_hasher.update(&block[..block_len]); - let mut ref_hash = [0u8; 32]; - ref_hasher.finalize(&mut ref_hash); - - let test_cv = test_impl.compress( - &block, - block_len as u32, - &TEST_KEY, - 0, - CHUNK_START | CHUNK_END | ROOT | KEYED_HASH, - ); - - assert_eq!(ref_hash, test_cv); - } -} - -fn check_transposed_eq(output_a: &TransposedVectors, output_b: &TransposedVectors) { - if output_a == output_b { - return; - } - for cv_index in 0..2 * MAX_SIMD_DEGREE { - let cv_a = output_a.extract_cv(cv_index); - let cv_b = output_b.extract_cv(cv_index); - if cv_a == [0; 32] && cv_b == [0; 32] { - println!("CV {cv_index:2} empty"); - } else if cv_a == cv_b { - println!("CV {cv_index:2} matches"); - } else { - println!("CV {cv_index:2} mismatch:"); - println!(" {}", hex::encode(cv_a)); - println!(" {}", hex::encode(cv_b)); - } - } - panic!("transposed outputs are not equal"); -} - -pub fn test_hash_chunks_vs_portable(test_impl: &Implementation) { - assert!(test_impl.degree() <= MAX_SIMD_DEGREE); - dbg!(test_impl.degree() * CHUNK_LEN); - // Allocate 4 extra bytes of padding so we can make aligned slices. - let mut input_buf = [0u8; 2 * 2 * MAX_SIMD_DEGREE * CHUNK_LEN + 4]; - let mut input_slice = &mut input_buf[..]; - // Make sure the start of the input is word-aligned. - while input_slice.as_ptr() as usize % 4 != 0 { - input_slice = &mut input_slice[1..]; - } - let (aligned_input, mut unaligned_input) = - input_slice.split_at_mut(2 * MAX_SIMD_DEGREE * CHUNK_LEN); - unaligned_input = &mut unaligned_input[1..][..2 * MAX_SIMD_DEGREE * CHUNK_LEN]; - assert_eq!(aligned_input.as_ptr() as usize % 4, 0); - assert_eq!(unaligned_input.as_ptr() as usize % 4, 1); - paint_test_input(aligned_input); - paint_test_input(unaligned_input); - // Try just below, equal to, and just above every whole number of chunks. - let mut input_2_lengths = Vec::new(); - let mut next_len = 2 * CHUNK_LEN; - loop { - // 95 is one whole block plus one interesting part of another - input_2_lengths.push(next_len - 95); - input_2_lengths.push(next_len); - if next_len == test_impl.degree() * CHUNK_LEN { - break; - } - input_2_lengths.push(next_len + 95); - next_len += CHUNK_LEN; - } - for input_2_len in input_2_lengths { - dbg!(input_2_len); - let aligned_input1 = &aligned_input[..test_impl.degree() * CHUNK_LEN]; - let aligned_input2 = &aligned_input[test_impl.degree() * CHUNK_LEN..][..input_2_len]; - let unaligned_input1 = &unaligned_input[..test_impl.degree() * CHUNK_LEN]; - let unaligned_input2 = &unaligned_input[test_impl.degree() * CHUNK_LEN..][..input_2_len]; - for initial_counter in INITIAL_COUNTERS { - dbg!(initial_counter); - // Make two calls, to test the output_column parameter. - let mut portable_output = TransposedVectors::new(); - let (portable_left, portable_right) = - test_impl.split_transposed_vectors(&mut portable_output); - portable::implementation().hash_chunks( - aligned_input1, - &IV_BYTES, - initial_counter, - 0, - portable_left, - ); - portable::implementation().hash_chunks( - aligned_input2, - &TEST_KEY, - initial_counter + test_impl.degree() as u64, - KEYED_HASH, - portable_right, - ); - - let mut test_output = TransposedVectors::new(); - let (test_left, test_right) = test_impl.split_transposed_vectors(&mut test_output); - test_impl.hash_chunks(aligned_input1, &IV_BYTES, initial_counter, 0, test_left); - test_impl.hash_chunks( - aligned_input2, - &TEST_KEY, - initial_counter + test_impl.degree() as u64, - KEYED_HASH, - test_right, - ); - check_transposed_eq(&portable_output, &test_output); - - // Do the same thing with unaligned input. - let mut unaligned_test_output = TransposedVectors::new(); - let (unaligned_left, unaligned_right) = - test_impl.split_transposed_vectors(&mut unaligned_test_output); - test_impl.hash_chunks( - unaligned_input1, - &IV_BYTES, - initial_counter, - 0, - unaligned_left, - ); - test_impl.hash_chunks( - unaligned_input2, - &TEST_KEY, - initial_counter + test_impl.degree() as u64, - KEYED_HASH, - unaligned_right, - ); - check_transposed_eq(&portable_output, &unaligned_test_output); - } - } -} - -fn painted_transposed_input() -> TransposedVectors { - let mut vectors = TransposedVectors::new(); - let mut val = 0; - for col in 0..2 * MAX_SIMD_DEGREE { - for row in 0..8 { - vectors.0[row][col] = val; - val += 1; - } - } - vectors -} - -pub fn test_hash_parents_vs_portable(test_impl: &Implementation) { - assert!(test_impl.degree() <= MAX_SIMD_DEGREE); - let input = painted_transposed_input(); - for num_parents in 2..=(test_impl.degree() / 2) { - dbg!(num_parents); - let mut portable_output = TransposedVectors::new(); - let (portable_left, portable_right) = - test_impl.split_transposed_vectors(&mut portable_output); - portable::implementation().hash_parents( - &input, - 2 * num_parents, // num_cvs - &IV_BYTES, - 0, - portable_left, - ); - portable::implementation().hash_parents( - &input, - 2 * num_parents, // num_cvs - &TEST_KEY, - KEYED_HASH, - portable_right, - ); - - let mut test_output = TransposedVectors::new(); - let (test_left, test_right) = test_impl.split_transposed_vectors(&mut test_output); - test_impl.hash_parents( - &input, - 2 * num_parents, // num_cvs - &IV_BYTES, - 0, - test_left, - ); - test_impl.hash_parents( - &input, - 2 * num_parents, // num_cvs - &TEST_KEY, - KEYED_HASH, - test_right, - ); - - check_transposed_eq(&portable_output, &test_output); - } -} - -fn hash_with_chunks_and_parents_recurse( - test_impl: &Implementation, - input: &[u8], - counter: u64, - output: TransposedSplit, -) -> usize { - assert!(input.len() > 0); - if input.len() <= test_impl.degree() * CHUNK_LEN { - return test_impl.hash_chunks(input, &IV_BYTES, counter, 0, output); - } - let (left_input, right_input) = input.split_at(left_len(input.len())); - let mut child_output = TransposedVectors::new(); - let (left_output, right_output) = test_impl.split_transposed_vectors(&mut child_output); - let mut children = - hash_with_chunks_and_parents_recurse(test_impl, left_input, counter, left_output); - assert_eq!(children, test_impl.degree()); - children += hash_with_chunks_and_parents_recurse( - test_impl, - right_input, - counter + (left_input.len() / CHUNK_LEN) as u64, - right_output, - ); - test_impl.hash_parents(&child_output, children, &IV_BYTES, PARENT, output) -} - -// Note: This test implementation doesn't support the 1-chunk-or-less case. -fn root_hash_with_chunks_and_parents(test_impl: &Implementation, input: &[u8]) -> CVBytes { - // TODO: handle the 1-chunk case? - assert!(input.len() > CHUNK_LEN); - let mut cvs = TransposedVectors::new(); - // The right half of these vectors are never used. - let (cvs_left, _) = test_impl.split_transposed_vectors(&mut cvs); - let mut num_cvs = hash_with_chunks_and_parents_recurse(test_impl, input, 0, cvs_left); - while num_cvs > 2 { - num_cvs = test_impl.reduce_parents(&mut cvs, num_cvs, &IV_BYTES, 0); - } - test_impl.compress( - &cvs.extract_parent_node(0), - BLOCK_LEN as u32, - &IV_BYTES, - 0, - PARENT | ROOT, - ) -} - -pub fn test_chunks_and_parents_vs_reference(test_impl: &Implementation) { - assert_eq!(test_impl.degree().count_ones(), 1, "power of 2"); - const MAX_INPUT_LEN: usize = 2 * MAX_SIMD_DEGREE * CHUNK_LEN; - let mut input_buf = [0u8; MAX_INPUT_LEN]; - paint_test_input(&mut input_buf); - // Try just below, equal to, and just above every whole number of chunks, except that - // root_hash_with_chunks_and_parents doesn't support the 1-chunk-or-less case. - let mut test_lengths = vec![CHUNK_LEN + 1]; - let mut next_len = 2 * CHUNK_LEN; - loop { - // 95 is one whole block plus one interesting part of another - test_lengths.push(next_len - 95); - test_lengths.push(next_len); - if next_len == MAX_INPUT_LEN { - break; - } - test_lengths.push(next_len + 95); - next_len += CHUNK_LEN; - } - for test_len in test_lengths { - dbg!(test_len); - let input = &input_buf[..test_len]; - - let mut ref_hasher = reference_impl::Hasher::new(); - ref_hasher.update(&input); - let mut ref_hash = [0u8; 32]; - ref_hasher.finalize(&mut ref_hash); - - let test_hash = root_hash_with_chunks_and_parents(test_impl, input); - - assert_eq!(ref_hash, test_hash); - } -} - -pub fn test_xof_vs_portable(test_impl: &Implementation) { - let flags = CHUNK_START | CHUNK_END | KEYED_HASH; - for counter in INITIAL_COUNTERS { - dbg!(counter); - for input_len in [0, 1, BLOCK_LEN] { - dbg!(input_len); - let mut input_block = [0u8; BLOCK_LEN]; - for byte_index in 0..input_len { - input_block[byte_index] = byte_index as u8 + 42; - } - // Try equal to and partway through every whole number of output blocks. - const MAX_OUTPUT_LEN: usize = 2 * MAX_SIMD_DEGREE * BLOCK_LEN; - let mut output_lengths = Vec::new(); - let mut next_len = 0; - loop { - output_lengths.push(next_len); - if next_len == MAX_OUTPUT_LEN { - break; - } - output_lengths.push(next_len + 31); - next_len += BLOCK_LEN; - } - for output_len in output_lengths { - dbg!(output_len); - let mut portable_output = [0xff; MAX_OUTPUT_LEN]; - portable::implementation().xof( - &input_block, - input_len as u32, - &TEST_KEY, - counter, - flags, - &mut portable_output[..output_len], - ); - let mut test_output = [0xff; MAX_OUTPUT_LEN]; - test_impl.xof( - &input_block, - input_len as u32, - &TEST_KEY, - counter, - flags, - &mut test_output[..output_len], - ); - assert_eq!(portable_output, test_output); - - // Double check that the implementation didn't overwrite. - assert!(test_output[output_len..].iter().all(|&b| b == 0xff)); - - // The first XOR cancels out the output. - test_impl.xof_xor( - &input_block, - input_len as u32, - &TEST_KEY, - counter, - flags, - &mut test_output[..output_len], - ); - assert!(test_output[..output_len].iter().all(|&b| b == 0)); - assert!(test_output[output_len..].iter().all(|&b| b == 0xff)); - - // The second XOR restores out the output. - test_impl.xof_xor( - &input_block, - input_len as u32, - &TEST_KEY, - counter, - flags, - &mut test_output[..output_len], - ); - assert_eq!(portable_output, test_output); - assert!(test_output[output_len..].iter().all(|&b| b == 0xff)); - } - } - } -} - -pub fn test_xof_vs_reference(test_impl: &Implementation) { - let input = b"hello world"; - let mut input_block = [0; BLOCK_LEN]; - input_block[..input.len()].copy_from_slice(input); - - const MAX_OUTPUT_LEN: usize = 2 * MAX_SIMD_DEGREE * BLOCK_LEN; - let mut ref_output = [0; MAX_OUTPUT_LEN]; - let mut ref_hasher = reference_impl::Hasher::new_keyed(&TEST_KEY); - ref_hasher.update(input); - ref_hasher.finalize(&mut ref_output); - - // Try equal to and partway through every whole number of output blocks. - let mut output_lengths = vec![0, 1, 31]; - let mut next_len = BLOCK_LEN; - loop { - output_lengths.push(next_len); - if next_len == MAX_OUTPUT_LEN { - break; - } - output_lengths.push(next_len + 31); - next_len += BLOCK_LEN; - } - - for output_len in output_lengths { - dbg!(output_len); - let mut test_output = [0; MAX_OUTPUT_LEN]; - test_impl.xof( - &input_block, - input.len() as u32, - &TEST_KEY, - 0, - KEYED_HASH | CHUNK_START | CHUNK_END, - &mut test_output[..output_len], - ); - assert_eq!(ref_output[..output_len], test_output[..output_len]); - - // Double check that the implementation didn't overwrite. - assert!(test_output[output_len..].iter().all(|&b| b == 0)); - - // Do it again starting from block 1. - if output_len >= BLOCK_LEN { - test_impl.xof( - &input_block, - input.len() as u32, - &TEST_KEY, - 1, - KEYED_HASH | CHUNK_START | CHUNK_END, - &mut test_output[..output_len - BLOCK_LEN], - ); - assert_eq!( - ref_output[BLOCK_LEN..output_len], - test_output[..output_len - BLOCK_LEN], - ); - } - } -} - -pub fn test_universal_hash_vs_portable(test_impl: &Implementation) { - const MAX_INPUT_LEN: usize = 2 * MAX_SIMD_DEGREE * BLOCK_LEN; - let mut input_buf = [0; MAX_INPUT_LEN]; - paint_test_input(&mut input_buf); - // Try equal to and partway through every whole number of input blocks. - let mut input_lengths = vec![0, 1, 31]; - let mut next_len = BLOCK_LEN; - loop { - input_lengths.push(next_len); - if next_len == MAX_INPUT_LEN { - break; - } - input_lengths.push(next_len + 31); - next_len += BLOCK_LEN; - } - for input_len in input_lengths { - dbg!(input_len); - for counter in INITIAL_COUNTERS { - dbg!(counter); - let portable_output = portable::implementation().universal_hash( - &input_buf[..input_len], - &TEST_KEY, - counter, - ); - let test_output = test_impl.universal_hash(&input_buf[..input_len], &TEST_KEY, counter); - assert_eq!(portable_output, test_output); - } - } -} - -fn reference_impl_universal_hash(input: &[u8], key: &CVBytes) -> [u8; UNIVERSAL_HASH_LEN] { - // The reference_impl doesn't support XOF seeking, so we have to materialize an entire extended - // output to seek to a block. - const MAX_BLOCKS: usize = 2 * MAX_SIMD_DEGREE; - assert!(input.len() / BLOCK_LEN <= MAX_BLOCKS); - let mut output_buffer: [u8; BLOCK_LEN * MAX_BLOCKS] = [0u8; BLOCK_LEN * MAX_BLOCKS]; - let mut result = [0u8; UNIVERSAL_HASH_LEN]; - let mut block_start = 0; - while block_start < input.len() { - let block_len = cmp::min(input.len() - block_start, BLOCK_LEN); - let mut ref_hasher = reference_impl::Hasher::new_keyed(key); - ref_hasher.update(&input[block_start..block_start + block_len]); - ref_hasher.finalize(&mut output_buffer[..block_start + UNIVERSAL_HASH_LEN]); - for byte_index in 0..UNIVERSAL_HASH_LEN { - result[byte_index] ^= output_buffer[block_start + byte_index]; - } - block_start += BLOCK_LEN; - } - result -} - -pub fn test_universal_hash_vs_reference(test_impl: &Implementation) { - const MAX_INPUT_LEN: usize = 2 * MAX_SIMD_DEGREE * BLOCK_LEN; - let mut input_buf = [0; MAX_INPUT_LEN]; - paint_test_input(&mut input_buf); - // Try equal to and partway through every whole number of input blocks. - let mut input_lengths = vec![0, 1, 31]; - let mut next_len = BLOCK_LEN; - loop { - input_lengths.push(next_len); - if next_len == MAX_INPUT_LEN { - break; - } - input_lengths.push(next_len + 31); - next_len += BLOCK_LEN; - } - for input_len in input_lengths { - dbg!(input_len); - let ref_output = reference_impl_universal_hash(&input_buf[..input_len], &TEST_KEY); - let test_output = test_impl.universal_hash(&input_buf[..input_len], &TEST_KEY, 0); - assert_eq!(ref_output, test_output); - } -} -- cgit v1.2.3-70-g09d2