From a830ab266127d765f808bc708d9f67c62b0bd6d2 Mon Sep 17 00:00:00 2001
From: Samuel Neves
Date: Thu, 23 Jan 2020 11:58:36 +0000
Subject: streamline load_counters

avx2 before:

        mov     eax, esi
        neg     rax
        vmovq   xmm0, rax
        vpbroadcastq    ymm0, xmm0
        vpand   ymm0, ymm0, ymmword ptr [rip + .LCPI1_0]
        vmovq   xmm2, rdi
        vpbroadcastq    ymm1, xmm2
        vpaddq  ymm1, ymm0, ymm1
        vmovdqa ymm0, ymmword ptr [rip + .LCPI1_1] # ymm0 = [0,2,4,6,4,6,6,7]
        vpermd  ymm3, ymm0, ymm1
        mov     r8d, eax
        and     r8d, 5
        add     r8, rdi
        mov     esi, eax
        and     esi, 6
        add     rsi, rdi
        and     eax, 7
        vpshufd xmm4, xmm3, 231         # xmm4 = xmm3[3,1,2,3]
        vpinsrd xmm4, xmm4, r8d, 1
        add     rax, rdi
        vpinsrd xmm4, xmm4, esi, 2
        vpinsrd xmm4, xmm4, eax, 3
        vpshufd xmm3, xmm3, 144         # xmm3 = xmm3[0,0,1,2]
        vpinsrd xmm3, xmm3, edi, 0
        vmovdqa xmmword ptr [rdx], xmm3
        vmovdqa xmmword ptr [rdx + 16], xmm4
        vpermq  ymm3, ymm1, 144         # ymm3 = ymm1[0,0,1,2]
        vpblendd        ymm2, ymm3, ymm2, 3 # ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7]
        vpsrlq  ymm2, ymm2, 32
        vpermd  ymm2, ymm0, ymm2
        vextracti128    xmm1, ymm1, 1
        vmovq   xmm3, rax
        vmovq   xmm4, rsi
        vpunpcklqdq     xmm3, xmm4, xmm3 # xmm3 = xmm4[0],xmm3[0]
        vmovq   xmm4, r8
        vpalignr        xmm1, xmm4, xmm1, 8 # xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
        vinserti128     ymm1, ymm1, xmm3, 1
        vpsrlq  ymm1, ymm1, 32
        vpermd  ymm0, ymm0, ymm1

avx2 after:

        neg     esi
        vmovd   xmm0, esi
        vpbroadcastd    ymm0, xmm0
        vmovd   xmm1, edi
        vpbroadcastd    ymm1, xmm1
        vpand   ymm0, ymm0, ymmword ptr [rip + .LCPI0_0]
        vpaddd  ymm1, ymm1, ymm0
        vpbroadcastd    ymm2, dword ptr [rip + .LCPI0_1] # ymm2 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
        vpor    ymm0, ymm0, ymm2
        vpxor   ymm2, ymm1, ymm2
        vpcmpgtd        ymm0, ymm0, ymm2
        shr     rdi, 32
        vmovd   xmm2, edi
        vpbroadcastd    ymm2, xmm2
        vpsubd  ymm0, ymm2, ymm0
---
 c/blake3_avx2.c   | 32 ++++++++++++--------------------
 c/blake3_avx512.c | 22 ++++++++--------------
 c/blake3_sse41.c  | 18 ++++++++++--------
 3 files changed, 30 insertions(+), 42 deletions(-)
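The old code built each 64-bit counter value in scalar form and split it into
halves; the new code stays in 32-bit lanes and reconstructs the carry into the
high half. A scalar model of the per-lane computation follows, for reference
only; the helper names lane_counter_lo/lane_counter_hi are illustrative, not
part of the patch:

    #include <stdbool.h>
    #include <stdint.h>

    /* Low 32 bits of the counter for lane i; the per-lane increment is
       masked out when increment_counter is false. */
    static uint32_t lane_counter_lo(uint64_t counter, bool increment_counter,
                                    uint32_t i) {
      uint32_t mask = -(uint32_t)increment_counter; /* all ones or all zeros */
      return (uint32_t)counter + (mask & i);
    }

    /* High 32 bits: the carry out of the low half occurs exactly when
       lo + add wraps, i.e. lo < add (unsigned). SSE/AVX2 have no unsigned
       32-bit compare, so the patch flips the sign bit of both operands and
       uses the signed compare instead; the vector compare yields 0 or -1
       per lane, which is why the AVX2/SSE4.1 code subtracts the result. */
    static uint32_t lane_counter_hi(uint64_t counter, bool increment_counter,
                                    uint32_t i) {
      uint32_t mask = -(uint32_t)increment_counter;
      uint32_t add = mask & i;
      uint32_t lo = (uint32_t)counter + add;
      uint32_t carry = (lo < add) ? 1 : 0; /* unsigned wraparound test */
      return (uint32_t)(counter >> 32) + carry;
    }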
diff --git a/c/blake3_avx2.c b/c/blake3_avx2.c
index a524e1a..370be4d 100644
--- a/c/blake3_avx2.c
+++ b/c/blake3_avx2.c
@@ -19,12 +19,6 @@ INLINE __m256i xorv(__m256i a, __m256i b) { return _mm256_xor_si256(a, b); }
 
 INLINE __m256i set1(uint32_t x) { return _mm256_set1_epi32((int32_t)x); }
 
-INLINE __m256i set8(uint32_t a, uint32_t b, uint32_t c, uint32_t d, uint32_t e,
-                    uint32_t f, uint32_t g, uint32_t h) {
-  return _mm256_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d,
-                           (int32_t)e, (int32_t)f, (int32_t)g, (int32_t)h);
-}
-
 INLINE __m256i rot16(__m256i x) {
   return _mm256_shuffle_epi8(
       x, _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2,
@@ -32,7 +26,7 @@ INLINE __m256i rot16(__m256i x) {
 }
 
 INLINE __m256i rot12(__m256i x) {
-  return xorv(_mm256_srli_epi32(x, 12), _mm256_slli_epi32(x, 32 - 12));
+  return _mm256_or_si256(_mm256_srli_epi32(x, 12), _mm256_slli_epi32(x, 32 - 12));
 }
 
 INLINE __m256i rot8(__m256i x) {
@@ -42,7 +36,7 @@ INLINE __m256i rot8(__m256i x) {
 }
 
 INLINE __m256i rot7(__m256i x) {
-  return xorv(_mm256_srli_epi32(x, 7), _mm256_slli_epi32(x, 32 - 7));
+  return _mm256_or_si256(_mm256_srli_epi32(x, 7), _mm256_slli_epi32(x, 32 - 7));
 }
 
 INLINE void round_fn(__m256i v[16], __m256i m[16], size_t r) {
@@ -221,18 +215,16 @@ INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
 }
 
 INLINE void load_counters(uint64_t counter, bool increment_counter,
-                          __m256i *out_low, __m256i *out_high) {
-  uint64_t mask = (increment_counter ? ~0 : 0);
-  *out_low = set8(
-      counter_low(counter + (mask & 0)), counter_low(counter + (mask & 1)),
-      counter_low(counter + (mask & 2)), counter_low(counter + (mask & 3)),
-      counter_low(counter + (mask & 4)), counter_low(counter + (mask & 5)),
-      counter_low(counter + (mask & 6)), counter_low(counter + (mask & 7)));
-  *out_high = set8(
-      counter_high(counter + (mask & 0)), counter_high(counter + (mask & 1)),
-      counter_high(counter + (mask & 2)), counter_high(counter + (mask & 3)),
-      counter_high(counter + (mask & 4)), counter_high(counter + (mask & 5)),
-      counter_high(counter + (mask & 6)), counter_high(counter + (mask & 7)));
+                          __m256i *out_lo, __m256i *out_hi) {
+  const __m256i mask = _mm256_set1_epi32(-(uint32_t)increment_counter);
+  const __m256i add0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+  const __m256i add1 = _mm256_and_si256(mask, add0);
+  __m256i l = _mm256_add_epi32(_mm256_set1_epi32(counter), add1);
+  __m256i carry = _mm256_cmpgt_epi32(_mm256_xor_si256(add1, _mm256_set1_epi32(0x80000000)),
+                                     _mm256_xor_si256(   l, _mm256_set1_epi32(0x80000000)));
+  __m256i h = _mm256_sub_epi32(_mm256_set1_epi32(counter >> 32), carry);
+  *out_lo = l;
+  *out_hi = h;
 }
 
 void blake3_hash8_avx2(const uint8_t *const *inputs, size_t blocks,
diff --git a/c/blake3_avx512.c b/c/blake3_avx512.c
index c9c136e..6477277 100644
--- a/c/blake3_avx512.c
+++ b/c/blake3_avx512.c
@@ -1044,20 +1044,14 @@ INLINE void transpose_msg_vecs16(const uint8_t *const *inputs,
 
 INLINE void load_counters16(uint64_t counter, bool increment_counter,
                             __m512i *out_lo, __m512i *out_hi) {
-  uint64_t mask = (increment_counter ? ~0 : 0);
-  __m512i mask_vec = _mm512_set1_epi64(mask);
-  __m512i deltas_a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
-  deltas_a = _mm512_and_si512(mask_vec, deltas_a);
-  __m512i deltas_b = _mm512_setr_epi64(8, 9, 10, 11, 12, 13, 14, 15);
-  deltas_b = _mm512_and_si512(mask_vec, deltas_b);
-  __m512i a = _mm512_add_epi64(_mm512_set1_epi64((int64_t)counter), deltas_a);
-  __m512i b = _mm512_add_epi64(_mm512_set1_epi64((int64_t)counter), deltas_b);
-  __m512i lo_indexes = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20,
-                                         22, 24, 26, 28, 30);
-  __m512i hi_indexes = _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21,
-                                         23, 25, 27, 29, 31);
-  *out_lo = _mm512_permutex2var_epi32(a, lo_indexes, b);
-  *out_hi = _mm512_permutex2var_epi32(a, hi_indexes, b);
+  const __m512i mask = _mm512_set1_epi32(-(uint32_t)increment_counter);
+  const __m512i add0 = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  const __m512i add1 = _mm512_and_si512(mask, add0);
+  __m512i l = _mm512_add_epi32(_mm512_set1_epi32(counter), add1);
+  __mmask16 carry = _mm512_cmp_epu32_mask(l, add1, _MM_CMPINT_LT);
+  __m512i h = _mm512_mask_add_epi32(_mm512_set1_epi32(counter >> 32), carry, _mm512_set1_epi32(counter >> 32), _mm512_set1_epi32(1));
+  *out_lo = l;
+  *out_hi = h;
 }
 
 void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks,
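Note that the AVX-512 version needs none of the sign-bit flipping: it has
native unsigned compares that write a mask register, and the conditional
increment becomes a masked add. A minimal sketch of just that carry step,
assuming AVX-512F (compile with -mavx512f); the helper name is ours, not
the patch's:

    #include <immintrin.h>

    /* hi + 1 in exactly the lanes where the low-half addition wrapped. */
    static inline __m512i carry_into_hi(__m512i lo_sum, __m512i addend,
                                        __m512i hi) {
      /* lo_sum < addend (unsigned) <=> that lane carried out */
      __mmask16 carry = _mm512_cmp_epu32_mask(lo_sum, addend, _MM_CMPINT_LT);
      /* add 1 only in the lanes whose mask bit is set */
      return _mm512_mask_add_epi32(hi, carry, hi, _mm512_set1_epi32(1));
    }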
diff --git a/c/blake3_sse41.c b/c/blake3_sse41.c
index 0d62a42..c1b919c 100644
--- a/c/blake3_sse41.c
+++ b/c/blake3_sse41.c
@@ -438,14 +438,16 @@ INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
 }
 
 INLINE void load_counters(uint64_t counter, bool increment_counter,
-                          __m128i *out_low, __m128i *out_high) {
-  uint64_t mask = (increment_counter ? ~0 : 0);
-  *out_low = set4(
-      counter_low(counter + (mask & 0)), counter_low(counter + (mask & 1)),
-      counter_low(counter + (mask & 2)), counter_low(counter + (mask & 3)));
-  *out_high = set4(
-      counter_high(counter + (mask & 0)), counter_high(counter + (mask & 1)),
-      counter_high(counter + (mask & 2)), counter_high(counter + (mask & 3)));
+                          __m128i *out_lo, __m128i *out_hi) {
+  const __m128i mask = _mm_set1_epi32(-(uint32_t)increment_counter);
+  const __m128i add0 = _mm_set_epi32(3, 2, 1, 0);
+  const __m128i add1 = _mm_and_si128(mask, add0);
+  __m128i l = _mm_add_epi32(_mm_set1_epi32(counter), add1);
+  __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)),
+                                  _mm_xor_si128(   l, _mm_set1_epi32(0x80000000)));
+  __m128i h = _mm_sub_epi32(_mm_set1_epi32(counter >> 32), carry);
+  *out_lo = l;
+  *out_hi = h;
 }
 
 void blake3_hash4_sse41(const uint8_t *const *inputs, size_t blocks,
-- 
cgit v1.2.3-70-g09d2
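P.S. A quick self-check of the SSE4.1 path, assuming load_counters from the
diff above is pasted into a standalone file (it is INLINE/static in the real
source, so not directly linkable) and compiled with -msse4.1. The counter is
chosen just below a 2^32 boundary so the carry path is exercised:

    #include <assert.h>
    #include <smmintrin.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* load_counters(...) from the diff above goes here */

    int main(void) {
      /* low half all ones, so lanes 1..3 must carry into the high half */
      uint64_t counter = 0xFFFFFFFFull;
      __m128i lo, hi;
      load_counters(counter, true, &lo, &hi);
      uint32_t lo_w[4], hi_w[4];
      _mm_storeu_si128((__m128i *)lo_w, lo);
      _mm_storeu_si128((__m128i *)hi_w, hi);
      for (int i = 0; i < 4; i++) {
        uint64_t want = counter + (uint64_t)i;
        assert(lo_w[i] == (uint32_t)want);
        assert(hi_w[i] == (uint32_t)(want >> 32));
      }
      printf("load_counters: carry path ok\n");
      return 0;
    }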