diff options
author | Samuel Neves <sneves@dei.uc.pt> | 2020-01-23 11:58:36 +0000 |
---|---|---|
committer | Samuel Neves <sneves@dei.uc.pt> | 2020-01-23 12:17:43 +0000 |
commit | a830ab266127d765f808bc708d9f67c62b0bd6d2 (patch) | |
tree | e64eafe473a0ffba1cbc06f4455ede56f7f09f91 /c | |
parent | de1458c565aad524386781793e33d64d45577368 (diff) |
streamline load_counters
avx2 before:
mov eax, esi
neg rax
vmovq xmm0, rax
vpbroadcastq ymm0, xmm0
vpand ymm0, ymm0, ymmword ptr [rip + .LCPI1_0]
vmovq xmm2, rdi
vpbroadcastq ymm1, xmm2
vpaddq ymm1, ymm0, ymm1
vmovdqa ymm0, ymmword ptr [rip + .LCPI1_1] # ymm0 = [0,2,4,6,4,6,6,7]
vpermd ymm3, ymm0, ymm1
mov r8d, eax
and r8d, 5
add r8, rdi
mov esi, eax
and esi, 6
add rsi, rdi
and eax, 7
vpshufd xmm4, xmm3, 231 # xmm4 = xmm3[3,1,2,3]
vpinsrd xmm4, xmm4, r8d, 1
add rax, rdi
vpinsrd xmm4, xmm4, esi, 2
vpinsrd xmm4, xmm4, eax, 3
vpshufd xmm3, xmm3, 144 # xmm3 = xmm3[0,0,1,2]
vpinsrd xmm3, xmm3, edi, 0
vmovdqa xmmword ptr [rdx], xmm3
vmovdqa xmmword ptr [rdx + 16], xmm4
vpermq ymm3, ymm1, 144 # ymm3 = ymm1[0,0,1,2]
vpblendd ymm2, ymm3, ymm2, 3 # ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7]
vpsrlq ymm2, ymm2, 32
vpermd ymm2, ymm0, ymm2
vextracti128 xmm1, ymm1, 1
vmovq xmm3, rax
vmovq xmm4, rsi
vpunpcklqdq xmm3, xmm4, xmm3 # xmm3 = xmm4[0],xmm3[0]
vmovq xmm4, r8
vpalignr xmm1, xmm4, xmm1, 8 # xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
vinserti128 ymm1, ymm1, xmm3, 1
vpsrlq ymm1, ymm1, 32
vpermd ymm0, ymm0, ymm1
avx2 after:
neg esi
vmovd xmm0, esi
vpbroadcastd ymm0, xmm0
vmovd xmm1, edi
vpbroadcastd ymm1, xmm1
vpand ymm0, ymm0, ymmword ptr [rip + .LCPI0_0]
vpaddd ymm1, ymm1, ymm0
vpbroadcastd ymm2, dword ptr [rip + .LCPI0_1] # ymm2 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
vpor ymm0, ymm0, ymm2
vpxor ymm2, ymm1, ymm2
vpcmpgtd ymm0, ymm0, ymm2
shr rdi, 32
vmovd xmm2, edi
vpbroadcastd ymm2, xmm2
vpsubd ymm0, ymm2, ymm0
Diffstat (limited to 'c')
-rw-r--r-- | c/blake3_avx2.c | 32 | ||||
-rw-r--r-- | c/blake3_avx512.c | 22 | ||||
-rw-r--r-- | c/blake3_sse41.c | 18 |
3 files changed, 30 insertions, 42 deletions
diff --git a/c/blake3_avx2.c b/c/blake3_avx2.c index a524e1a..370be4d 100644 --- a/c/blake3_avx2.c +++ b/c/blake3_avx2.c @@ -19,12 +19,6 @@ INLINE __m256i xorv(__m256i a, __m256i b) { return _mm256_xor_si256(a, b); } INLINE __m256i set1(uint32_t x) { return _mm256_set1_epi32((int32_t)x); } -INLINE __m256i set8(uint32_t a, uint32_t b, uint32_t c, uint32_t d, uint32_t e, - uint32_t f, uint32_t g, uint32_t h) { - return _mm256_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d, - (int32_t)e, (int32_t)f, (int32_t)g, (int32_t)h); -} - INLINE __m256i rot16(__m256i x) { return _mm256_shuffle_epi8( x, _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, @@ -32,7 +26,7 @@ INLINE __m256i rot16(__m256i x) { } INLINE __m256i rot12(__m256i x) { - return xorv(_mm256_srli_epi32(x, 12), _mm256_slli_epi32(x, 32 - 12)); + return _mm256_or_si256(_mm256_srli_epi32(x, 12), _mm256_slli_epi32(x, 32 - 12)); } INLINE __m256i rot8(__m256i x) { @@ -42,7 +36,7 @@ INLINE __m256i rot8(__m256i x) { } INLINE __m256i rot7(__m256i x) { - return xorv(_mm256_srli_epi32(x, 7), _mm256_slli_epi32(x, 32 - 7)); + return _mm256_or_si256(_mm256_srli_epi32(x, 7), _mm256_slli_epi32(x, 32 - 7)); } INLINE void round_fn(__m256i v[16], __m256i m[16], size_t r) { @@ -221,18 +215,16 @@ INLINE void transpose_msg_vecs(const uint8_t *const *inputs, } INLINE void load_counters(uint64_t counter, bool increment_counter, - __m256i *out_low, __m256i *out_high) { - uint64_t mask = (increment_counter ? ~0 : 0); - *out_low = set8( - counter_low(counter + (mask & 0)), counter_low(counter + (mask & 1)), - counter_low(counter + (mask & 2)), counter_low(counter + (mask & 3)), - counter_low(counter + (mask & 4)), counter_low(counter + (mask & 5)), - counter_low(counter + (mask & 6)), counter_low(counter + (mask & 7))); - *out_high = set8( - counter_high(counter + (mask & 0)), counter_high(counter + (mask & 1)), - counter_high(counter + (mask & 2)), counter_high(counter + (mask & 3)), - counter_high(counter + (mask & 4)), counter_high(counter + (mask & 5)), - counter_high(counter + (mask & 6)), counter_high(counter + (mask & 7))); + __m256i *out_lo, __m256i *out_hi) { + const __m256i mask = _mm256_set1_epi32(-(uint32_t)increment_counter); + const __m256i add0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); + const __m256i add1 = _mm256_and_si256(mask, add0); + __m256i l = _mm256_add_epi32(_mm256_set1_epi32(counter), add1); + __m256i carry = _mm256_cmpgt_epi32(_mm256_xor_si256(add1, _mm256_set1_epi32(0x80000000)), + _mm256_xor_si256( l, _mm256_set1_epi32(0x80000000))); + __m256i h = _mm256_sub_epi32(_mm256_set1_epi32(counter >> 32), carry); + *out_lo = l; + *out_hi = h; } void blake3_hash8_avx2(const uint8_t *const *inputs, size_t blocks,
diff --git a/c/blake3_avx512.c b/c/blake3_avx512.c index c9c136e..6477277 100644 --- a/c/blake3_avx512.c +++ b/c/blake3_avx512.c @@ -1044,20 +1044,14 @@ INLINE void transpose_msg_vecs16(const uint8_t *const *inputs, INLINE void load_counters16(uint64_t counter, bool increment_counter, __m512i *out_lo, __m512i *out_hi) { - uint64_t mask = (increment_counter ? ~0 : 0); - __m512i mask_vec = _mm512_set1_epi64(mask); - __m512i deltas_a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7); - deltas_a = _mm512_and_si512(mask_vec, deltas_a); - __m512i deltas_b = _mm512_setr_epi64(8, 9, 10, 11, 12, 13, 14, 15); - deltas_b = _mm512_and_si512(mask_vec, deltas_b); - __m512i a = _mm512_add_epi64(_mm512_set1_epi64((int64_t)counter), deltas_a); - __m512i b = _mm512_add_epi64(_mm512_set1_epi64((int64_t)counter), deltas_b); - __m512i lo_indexes = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, - 22, 24, 26, 28, 30); - __m512i hi_indexes = _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, - 23, 25, 27, 29, 31); - *out_lo = _mm512_permutex2var_epi32(a, lo_indexes, b); - *out_hi = _mm512_permutex2var_epi32(a, hi_indexes, b); + const __m512i mask = _mm512_set1_epi32(-(uint32_t)increment_counter); + const __m512i add0 = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + const __m512i add1 = _mm512_and_si512(mask, add0); + __m512i l = _mm512_add_epi32(_mm512_set1_epi32(counter), add1); + __mmask16 carry = _mm512_cmp_epu32_mask(l, add1, _MM_CMPINT_LT); + __m512i h = _mm512_mask_add_epi32(_mm512_set1_epi32(counter >> 32), carry, _mm512_set1_epi32(counter >> 32), _mm512_set1_epi32(1)); + *out_lo = l; + *out_hi = h; } void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks,
diff --git a/c/blake3_sse41.c b/c/blake3_sse41.c index 0d62a42..c1b919c 100644 --- a/c/blake3_sse41.c +++ b/c/blake3_sse41.c @@ -438,14 +438,16 @@ INLINE void transpose_msg_vecs(const uint8_t *const *inputs, } INLINE void load_counters(uint64_t counter, bool increment_counter, - __m128i *out_low, __m128i *out_high) { - uint64_t mask = (increment_counter ? ~0 : 0); - *out_low = set4( - counter_low(counter + (mask & 0)), counter_low(counter + (mask & 1)), - counter_low(counter + (mask & 2)), counter_low(counter + (mask & 3))); - *out_high = set4( - counter_high(counter + (mask & 0)), counter_high(counter + (mask & 1)), - counter_high(counter + (mask & 2)), counter_high(counter + (mask & 3))); + __m128i *out_lo, __m128i *out_hi) { + const __m128i mask = _mm_set1_epi32(-(uint32_t)increment_counter); + const __m128i add0 = _mm_set_epi32(3, 2, 1, 0); + const __m128i add1 = _mm_and_si128(mask, add0); + __m128i l = _mm_add_epi32(_mm_set1_epi32(counter), add1); + __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)), + _mm_xor_si128( l, _mm_set1_epi32(0x80000000))); + __m128i h = _mm_sub_epi32(_mm_set1_epi32(counter >> 32), carry); + *out_lo = l; + *out_hi = h; } void blake3_hash4_sse41(const uint8_t *const *inputs, size_t blocks,