[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Qemu-devel] [PATCH 10/10] cutils: Rewrite x86 buffer zero checking
From: |
Paolo Bonzini |
Subject: |
[Qemu-devel] [PATCH 10/10] cutils: Rewrite x86 buffer zero checking |
Date: |
Tue, 13 Sep 2016 18:10:05 +0200 |
From: Richard Henderson <address@hidden>
Handle alignment of buffers, so that the vector paths can be
used more often. Add versions for AVX1 and SSE4.1, both of
which have incremental improvements over SSE2.
Signed-off-by: Richard Henderson <address@hidden>
Signed-off-by: Paolo Bonzini <address@hidden>
---
util/bufferiszero.c | 139 ++++++++++++++++++++++++++++++++++++----------------
1 file changed, 96 insertions(+), 43 deletions(-)
diff --git a/util/bufferiszero.c b/util/bufferiszero.c
index 4d8a8c8..c23589c 100644
--- a/util/bufferiszero.c
+++ b/util/bufferiszero.c
@@ -26,38 +26,6 @@
#include "qemu/cutils.h"
#include "qemu/bswap.h"
-
-/* vector definitions */
-
-extern void link_error(void);
-
-#define ACCEL_BUFFER_ZERO(NAME, SIZE, VECTYPE, NONZERO) \
-static bool NAME(const void *buf, size_t len) \
-{ \
- const void *end = buf + len; \
- do { \
- const VECTYPE *p = buf; \
- VECTYPE t; \
- __builtin_prefetch(buf + SIZE); \
- barrier(); \
- if (SIZE == sizeof(VECTYPE) * 4) { \
- t = (p[0] | p[1]) | (p[2] | p[3]); \
- } else if (SIZE == sizeof(VECTYPE) * 8) { \
- t = p[0] | p[1]; \
- t |= p[2] | p[3]; \
- t |= p[4] | p[5]; \
- t |= p[6] | p[7]; \
- } else { \
- link_error(); \
- } \
- if (unlikely(NONZERO(t))) { \
- return false; \
- } \
- buf += SIZE; \
- } while (buf < end); \
- return true; \
-}
-
static bool
buffer_zero_int(const void *buf, size_t len)
{
@@ -102,24 +70,110 @@ buffer_zero_int(const void *buf, size_t len)
#pragma GCC push_options
#pragma GCC target("sse2")
#include <emmintrin.h>
-#define SSE2_NONZERO(X) \
- (_mm_movemask_epi8(_mm_cmpeq_epi8((X), _mm_setzero_si128())) != 0xFFFF)
-ACCEL_BUFFER_ZERO(buffer_zero_sse2, 64, __m128i, SSE2_NONZERO)
+
+static bool
+buffer_zero_sse2(const void *buf, size_t len)
+{
+ __m128i t = _mm_loadu_si128(buf);
+ __m128i *p = (__m128i *)(((uintptr_t)buf + 5 * 16) & -16);
+ __m128i *e = (__m128i *)(((uintptr_t)buf + len) & -16);
+ __m128i zero = _mm_setzero_si128();
+
+ /* Loop over 16-byte aligned blocks of 64. */
+ while (likely(p <= e)) {
+ __builtin_prefetch(p);
+ t = _mm_cmpeq_epi8(t, zero);
+ if (unlikely(_mm_movemask_epi8(t) != 0xFFFF)) {
+ return false;
+ }
+ t = p[-4] | p[-3] | p[-2] | p[-1];
+ p += 4;
+ }
+
+ /* Finish the aligned tail. */
+ t |= e[-3];
+ t |= e[-2];
+ t |= e[-1];
+
+ /* Finish the unaligned tail. */
+ t |= _mm_loadu_si128(buf + len - 16);
+
+ return _mm_movemask_epi8(_mm_cmpeq_epi8(t, zero)) == 0xFFFF;
+}
#pragma GCC pop_options
#ifdef CONFIG_AVX2_OPT
#pragma GCC push_options
#pragma GCC target("sse4")
#include <smmintrin.h>
-#define SSE4_NONZERO(X) !_mm_testz_si128((X), (X))
-ACCEL_BUFFER_ZERO(buffer_zero_sse4, 64, __m128i, SSE4_NONZERO)
+
+static bool
+buffer_zero_sse4(const void *buf, size_t len)
+{
+ __m128i t = _mm_loadu_si128(buf);
+ __m128i *p = (__m128i *)(((uintptr_t)buf + 5 * 16) & -16);
+ __m128i *e = (__m128i *)(((uintptr_t)buf + len) & -16);
+
+ /* Loop over 16-byte aligned blocks of 64. */
+ while (likely(p <= e)) {
+ __builtin_prefetch(p);
+ if (unlikely(!_mm_testz_si128(t, t))) {
+ return false;
+ }
+ t = p[-4] | p[-3] | p[-2] | p[-1];
+ p += 4;
+ }
+
+ /* Finish the aligned tail. */
+ t |= e[-3];
+ t |= e[-2];
+ t |= e[-1];
+
+ /* Finish the unaligned tail. */
+ t |= _mm_loadu_si128(buf + len - 16);
+
+ return _mm_testz_si128(t, t);
+}
#pragma GCC pop_options
#pragma GCC push_options
#pragma GCC target("avx2")
#include <immintrin.h>
-#define AVX2_NONZERO(X) !_mm256_testz_si256((X), (X))
-ACCEL_BUFFER_ZERO(buffer_zero_avx2, 128, __m256i, AVX2_NONZERO)
+
+static bool
+buffer_zero_avx2(const void *buf, size_t len)
+{
+ /* Begin with an unaligned head of 32 bytes. */
+ __m256i t = _mm256_loadu_si256(buf);
+ __m256i *p = (__m256i *)(((uintptr_t)buf + 5 * 32) & -32);
+ __m256i *e = (__m256i *)(((uintptr_t)buf + len) & -32);
+
+ if (likely(p <= e)) {
+ /* Loop over 32-byte aligned blocks of 128. */
+ do {
+ __builtin_prefetch(p);
+ if (unlikely(!_mm256_testz_si256(t, t))) {
+ return false;
+ }
+ t = p[-4] | p[-3] | p[-2] | p[-1];
+ p += 4;
+ } while (p <= e);
+ } else {
+ t |= _mm256_loadu_si256(buf + 32);
+ if (len <= 128) {
+ goto last2;
+ }
+ }
+
+ /* Finish the last block of 128 unaligned. */
+ t |= _mm256_loadu_si256(buf + len - 4 * 32);
+ t |= _mm256_loadu_si256(buf + len - 3 * 32);
+ last2:
+ t |= _mm256_loadu_si256(buf + len - 2 * 32);
+ t |= _mm256_loadu_si256(buf + len - 1 * 32);
+
+ return _mm256_testz_si256(t, t);
+}
#pragma GCC pop_options
#endif
@@ -177,16 +231,15 @@ bool test_buffer_is_zero_next_accel(void)
static bool select_accel_fn(const void *buf, size_t len)
{
- uintptr_t ibuf = (uintptr_t)buf;
#ifdef CONFIG_AVX2_OPT
- if (len % 128 == 0 && ibuf % 32 == 0 && (cpuid_cache & CACHE_AVX2)) {
+ if (len >= 128 && (cpuid_cache & CACHE_AVX2)) {
return buffer_zero_avx2(buf, len);
}
- if (len % 64 == 0 && ibuf % 16 == 0 && (cpuid_cache & CACHE_SSE4)) {
+ if (len >= 64 && (cpuid_cache & CACHE_SSE4)) {
return buffer_zero_sse4(buf, len);
}
#endif
- if (len % 64 == 0 && ibuf % 16 == 0 && (cpuid_cache & CACHE_SSE2)) {
+ if (len >= 64 && (cpuid_cache & CACHE_SSE2)) {
return buffer_zero_sse2(buf, len);
}
return buffer_zero_int(buf, len);
--
1.8.3.1
- [Qemu-devel] [PATCH v4 00/10] Improve buffer_is_zero, Paolo Bonzini, 2016/09/13
- [Qemu-devel] [PATCH 01/10] cutils: Move buffer_is_zero and subroutines to a new file, Paolo Bonzini, 2016/09/13
- [Qemu-devel] [PATCH 04/10] cutils: Rearrange buffer_is_zero acceleration, Paolo Bonzini, 2016/09/13
- [Qemu-devel] [PATCH 03/10] cutils: Export only buffer_is_zero, Paolo Bonzini, 2016/09/13
- [Qemu-devel] [PATCH 05/10] cutils: Remove aarch64 buffer zero checking, Paolo Bonzini, 2016/09/13
- [Qemu-devel] [PATCH 02/10] cutils: Remove SPLAT macro, Paolo Bonzini, 2016/09/13
- [Qemu-devel] [PATCH 07/10] cutils: Add test for buffer_is_zero, Paolo Bonzini, 2016/09/13
- [Qemu-devel] [PATCH 06/10] cutils: Remove ppc buffer zero checking, Paolo Bonzini, 2016/09/13
- [Qemu-devel] [PATCH 08/10] cutils: Add SSE4 version, Paolo Bonzini, 2016/09/13
- [Qemu-devel] [PATCH 09/10] cutils: Add generic prefetch, Paolo Bonzini, 2016/09/13
- [Qemu-devel] [PATCH 10/10] cutils: Rewrite x86 buffer zero checking,
Paolo Bonzini <=