diff options
author | Xiaoyun Li <xiaoyun.li@intel.com> | 2017-11-03 20:47:23 +0800 |
---|---|---|
committer | Thomas Monjalon <thomas@monjalon.net> | 2017-11-07 01:16:03 +0100 |
commit | d35cc1fe6a7a1cafb6b70c514bcfb9f0cee9e4b7 (patch) | |
tree | 08ee509887a1991f9956d9f5509b4ea9fe01bf5e /lib/librte_efd | |
parent | e3a64deae2d58307c8f7bea15e6661f5150853d5 (diff) |
eal/x86: revert select optimized memcpy at run-time
Revert the patchset run-time Linking support including the following
3 commits:
Fixes: 84cc318424d4 ("eal/x86: select optimized memcpy at run-time")
Fixes: c7fbc80fe60f ("test: select memcpy alignment unit at run-time")
Fixes: 5f180ae32962 ("efd: move AVX2 lookup in its own compilation unit")
The patchset would cause perf drop in vhost/virtio loopback performance
test. Because the run-time dispatch must cost at least a function call
comparing to the compile-time dispatch. And the reference cpu cycles value
is small. And in the test, when using 128-256 bytes packet, it would cause
16%-20% perf drop with mergeble path. When using 256 bytes packet, it would
cause 13% perf drop with vector path.
Signed-off-by: Xiaoyun Li <xiaoyun.li@intel.com>
Diffstat (limited to 'lib/librte_efd')
-rw-r--r-- | lib/librte_efd/Makefile | 6 | ||||
-rw-r--r-- | lib/librte_efd/rte_efd_x86.c | 77 | ||||
-rw-r--r-- | lib/librte_efd/rte_efd_x86.h | 48 |
3 files changed, 46 insertions, 85 deletions
diff --git a/lib/librte_efd/Makefile b/lib/librte_efd/Makefile index b0467186a4..16e450e8dc 100644 --- a/lib/librte_efd/Makefile +++ b/lib/librte_efd/Makefile @@ -45,12 +45,6 @@ LIBABIVER := 1 # all source are stored in SRCS-y SRCS-$(CONFIG_RTE_LIBRTE_EFD) := rte_efd.c -# if the compiler supports AVX2, add efd x86 file -ifneq ($(findstring CC_SUPPORT_AVX2,$(MACHINE_CFLAGS)),) -SRCS-$(CONFIG_RTE_ARCH_X86) += rte_efd_x86.c -CFLAGS_rte_efd_x86.o += -mavx2 -endif - # install this header file SYMLINK-$(CONFIG_RTE_LIBRTE_EFD)-include := rte_efd.h diff --git a/lib/librte_efd/rte_efd_x86.c b/lib/librte_efd/rte_efd_x86.c deleted file mode 100644 index 49677db075..0000000000 --- a/lib/librte_efd/rte_efd_x86.c +++ /dev/null @@ -1,77 +0,0 @@ -/*- - * BSD LICENSE - * - * Copyright(c) 2016-2017 Intel Corporation. All rights reserved. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* rte_efd_x86.c - * This file holds all x86 specific EFD functions - */ -#include <rte_efd.h> -#include <rte_efd_x86.h> - -#if (RTE_EFD_VALUE_NUM_BITS == 8 || RTE_EFD_VALUE_NUM_BITS == 16 || \ - RTE_EFD_VALUE_NUM_BITS == 24 || RTE_EFD_VALUE_NUM_BITS == 32) -#define EFD_LOAD_SI128(val) _mm_load_si128(val) -#else -#define EFD_LOAD_SI128(val) _mm_lddqu_si128(val) -#endif - -efd_value_t -efd_lookup_internal_avx2(const efd_hashfunc_t *group_hash_idx, - const efd_lookuptbl_t *group_lookup_table, - const uint32_t hash_val_a, const uint32_t hash_val_b) -{ - efd_value_t value = 0; - uint32_t i = 0; - __m256i vhash_val_a = _mm256_set1_epi32(hash_val_a); - __m256i vhash_val_b = _mm256_set1_epi32(hash_val_b); - - for (; i < RTE_EFD_VALUE_NUM_BITS; i += 8) { - __m256i vhash_idx = - _mm256_cvtepu16_epi32(EFD_LOAD_SI128( - (__m128i const *) &group_hash_idx[i])); - __m256i vlookup_table = _mm256_cvtepu16_epi32( - EFD_LOAD_SI128((__m128i const *) - &group_lookup_table[i])); - __m256i vhash = _mm256_add_epi32(vhash_val_a, - _mm256_mullo_epi32(vhash_idx, vhash_val_b)); - __m256i vbucket_idx = _mm256_srli_epi32(vhash, - EFD_LOOKUPTBL_SHIFT); - __m256i vresult = _mm256_srlv_epi32(vlookup_table, - vbucket_idx); - - value |= (_mm256_movemask_ps( - (__m256) _mm256_slli_epi32(vresult, 31)) - & ((1 << (RTE_EFD_VALUE_NUM_BITS - i)) - 1)) << i; - } - - return value; -} diff --git a/lib/librte_efd/rte_efd_x86.h b/lib/librte_efd/rte_efd_x86.h index 7a082aa165..34f37d73b7 100644 --- a/lib/librte_efd/rte_efd_x86.h +++ b/lib/librte_efd/rte_efd_x86.h @@ -36,7 +36,51 @@ */ #include <immintrin.h> -extern efd_value_t +#if (RTE_EFD_VALUE_NUM_BITS == 8 || RTE_EFD_VALUE_NUM_BITS == 16 || \ + RTE_EFD_VALUE_NUM_BITS == 24 || RTE_EFD_VALUE_NUM_BITS == 32) +#define EFD_LOAD_SI128(val) _mm_load_si128(val) +#else +#define EFD_LOAD_SI128(val) _mm_lddqu_si128(val) +#endif + +static inline efd_value_t efd_lookup_internal_avx2(const efd_hashfunc_t *group_hash_idx, const efd_lookuptbl_t *group_lookup_table, - const uint32_t hash_val_a, const uint32_t hash_val_b); + const uint32_t hash_val_a, const uint32_t hash_val_b) +{ +#ifdef RTE_MACHINE_CPUFLAG_AVX2 + efd_value_t value = 0; + uint32_t i = 0; + __m256i vhash_val_a = _mm256_set1_epi32(hash_val_a); + __m256i vhash_val_b = _mm256_set1_epi32(hash_val_b); + + for (; i < RTE_EFD_VALUE_NUM_BITS; i += 8) { + __m256i vhash_idx = + _mm256_cvtepu16_epi32(EFD_LOAD_SI128( + (__m128i const *) &group_hash_idx[i])); + __m256i vlookup_table = _mm256_cvtepu16_epi32( + EFD_LOAD_SI128((__m128i const *) + &group_lookup_table[i])); + __m256i vhash = _mm256_add_epi32(vhash_val_a, + _mm256_mullo_epi32(vhash_idx, vhash_val_b)); + __m256i vbucket_idx = _mm256_srli_epi32(vhash, + EFD_LOOKUPTBL_SHIFT); + __m256i vresult = _mm256_srlv_epi32(vlookup_table, + vbucket_idx); + + value |= (_mm256_movemask_ps( + (__m256) _mm256_slli_epi32(vresult, 31)) + & ((1 << (RTE_EFD_VALUE_NUM_BITS - i)) - 1)) << i; + } + + return value; +#else + RTE_SET_USED(group_hash_idx); + RTE_SET_USED(group_lookup_table); + RTE_SET_USED(hash_val_a); + RTE_SET_USED(hash_val_b); + /* Return dummy value, only to avoid compilation breakage */ + return 0; +#endif + +} |