summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authordivinity76 <hans@loltek.net>2024-03-12 08:21:51 +0100
committerGitHub <noreply@github.com>2024-03-12 03:21:51 -0400
commit58bea0bcbba3629043939aa499068055dd0df017 (patch)
tree9f2b9aba4dbeca3d7e87d1c4ad4c094f17b84afc
parent5b9af1c34746e20b4596c1812b683624bdcfc152 (diff)
optimize neon loadu_128/storeu_128 (#384)
vld1q_u8 and vst1q_u8 have no alignment requirements. This improves performance on Oracle Cloud's VM.Standard.A1.Flex by approximately 1.15% on a 16*1024 input, from about 13920 nanoseconds down to about 13800 nanoseconds.
-rw-r--r--c/blake3_neon.c6
1 files changed, 2 insertions, 4 deletions
diff --git a/c/blake3_neon.c b/c/blake3_neon.c
index 8a818fc..90bdd57 100644
--- a/c/blake3_neon.c
+++ b/c/blake3_neon.c
@@ -10,14 +10,12 @@
INLINE uint32x4_t loadu_128(const uint8_t src[16]) {
// vld1q_u32 has alignment requirements. Don't use it.
- uint32x4_t x;
- memcpy(&x, src, 16);
- return x;
+ return vreinterpretq_u32_u8(vld1q_u8(src));
}
INLINE void storeu_128(uint32x4_t src, uint8_t dest[16]) {
// vst1q_u32 has alignment requirements. Don't use it.
- memcpy(dest, &src, 16);
+ vst1q_u8(dest, vreinterpretq_u8_u32(src));
}
INLINE uint32x4_t add_128(uint32x4_t a, uint32x4_t b) {