[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Qemu-devel] [PATCH v1 01/14] tests: add fp-bench, a collection of simple floating-point microbenchmarks
From: |
Alex Bennée |
Subject: |
Re: [Qemu-devel] [PATCH v1 01/14] tests: add fp-bench, a collection of simple floating-point microbenchmarks |
Date: |
Tue, 27 Mar 2018 09:45:14 +0100 |
User-agent: |
mu4e 1.1.0; emacs 26.0.91 |
Emilio G. Cota <address@hidden> writes:
> This will allow us to measure the performance impact of FP
> emulation optimizations.
>
> Signed-off-by: Emilio G. Cota <address@hidden>
> ---
> tests/fp-bench.c | 290
> +++++++++++++++++++++++++++++++++++++++++++++++++
> tests/.gitignore | 1 +
> tests/Makefile.include | 3 +-
> 3 files changed, 293 insertions(+), 1 deletion(-)
> create mode 100644 tests/fp-bench.c
>
> diff --git a/tests/fp-bench.c b/tests/fp-bench.c
> new file mode 100644
> index 0000000..a782093
> --- /dev/null
> +++ b/tests/fp-bench.c
> @@ -0,0 +1,290 @@
> +/*
> + * fp-bench.c - A collection of simple floating point microbenchmarks.
> + *
> + * Copyright (C) 2018, Emilio G. Cota <address@hidden>
> + *
> + * License: GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + */
> +#include "qemu/osdep.h"
> +#include "qemu/atomic.h"
> +
> +#include <math.h>
> +
> +#include <sys/time.h>
> +#include <stdint.h>
> +#include <stdlib.h>
> +#include <unistd.h>
> +#include <stdio.h>
> +#include <time.h>
> +
> +/* amortize the computation of random inputs */
> +#define OPS_PER_ITER (1000ULL)
> +
> +#define SEED_A 0xdeadfacedeadface
> +#define SEED_B 0xbadc0feebadc0fee
> +#define SEED_C 0xbeefdeadbeefdead
> +
> +enum op {
> + OP_ADD,
> + OP_SUB,
> + OP_MUL,
> + OP_DIV,
> + OP_FMA,
> + OP_SQRT,
> +};
> +
> +static const char * const op_names[] = {
> + [OP_ADD] = "add",
> + [OP_SUB] = "sub",
> + [OP_MUL] = "mul",
> + [OP_DIV] = "div",
> + [OP_FMA] = "fma",
> + [OP_SQRT] = "sqrt",
> +};
> +
> +static uint64_t n_ops = 10000000;
> +static enum op op;
> +static const char *precision = "float";
> +
> +static const char commands_string[] =
> + " -n = number of floating point operations\n"
> + " -o = floating point operation (add, sub, mul, div, fma, sqrt).
> Default: add\n"
> + " -p = precision (float|single, double). Default: float";
> +
> +static void usage_complete(int argc, char *argv[])
> +{
> + fprintf(stderr, "Usage: %s [options]\n", argv[0]);
> + fprintf(stderr, "options:\n%s\n", commands_string);
> + exit(-1);
> +}
> +
> +static void set_op(const char *name)
> +{
> + int i;
> +
> + for (i = 0; i < ARRAY_SIZE(op_names); i++) {
> + if (strcmp(name, op_names[i]) == 0) {
> + op = i;
> + return;
> + }
> + }
> + fprintf(stderr, "Unsupported op '%s'\n", name);
> + exit(EXIT_FAILURE);
> +}
> +
> +static inline int64_t get_clock_realtime(void)
> +{
> + struct timeval tv;
> +
> + gettimeofday(&tv, NULL);
> + return tv.tv_sec * 1000000000LL + (tv.tv_usec * 1000);
> +}
> +
> +/*
> + * From: https://en.wikipedia.org/wiki/Xorshift
> + * This is faster than rand_r(), and gives us a wider range (RAND_MAX is only
> + * guaranteed to be >= INT_MAX).
> + */
> +static uint64_t xorshift64star(uint64_t x)
> +{
> + x ^= x >> 12; /* a */
> + x ^= x << 25; /* b */
> + x ^= x >> 27; /* c */
> + return x * UINT64_C(2685821657736338717);
> +}
> +
> +static inline bool u32_is_normal(uint32_t x)
> +{
> + return ((x + 0x00800000) & 0x7fffffff) >= 0x01000000;
> +}
> +
> +static inline bool u64_is_normal(uint64_t x)
> +{
> + return ((x + (1ULL << 52)) & -1ULL >> 1) >= 1ULL << 53;
> +}
> +
> +static inline float get_random_float(uint64_t *x)
> +{
> + uint64_t r = *x;
> + uint32_t r32;
> +
> + do {
> + r = xorshift64star(r);
> + } while (!u32_is_normal(r));
> + *x = r;
> + r32 = r;
> + return *(float *)&r32;
> +}
> +
> +static inline double get_random_double(uint64_t *x)
> +{
> + uint64_t r = *x;
> +
> + do {
> + r = xorshift64star(r);
> + } while (!u64_is_normal(r));
> + *x = r;
> + return *(double *)&r;
> +}
> +
> +/*
> + * Disable optimizations (e.g. "a OP b" outside of the inner loop) with
> + * volatile.
> + */
> +#define GEN_BENCH_1OPF(NAME, FUNC, PRECISION) \
> + static void NAME(volatile PRECISION *res) \
> + { \
> + uint64_t ra = SEED_A; \
> + uint64_t i, j; \
> + \
> + for (i = 0; i < n_ops; i += OPS_PER_ITER) { \
> + volatile PRECISION a = glue(get_random_, PRECISION)(&ra); \
> + \
> + for (j = 0; j < OPS_PER_ITER; j++) { \
> + *res = FUNC(a); \
> + } \
> + } \
> + }
> +
Have you had a chance to check whether this will vectorise? I have a
similar benchmark which I compile with several sets of options to test
normal, NEON/AdvSIMD and SVE-enabled loops.
> +GEN_BENCH_1OPF(bench_float_sqrt, sqrtf, float)
> +GEN_BENCH_1OPF(bench_double_sqrt, sqrt, double)
> +#undef GEN_BENCH_1OPF
> +
> +#define GEN_BENCH_2OP(NAME, OP, PRECISION) \
> + static void NAME(volatile PRECISION *res) \
> + { \
> + uint64_t ra = SEED_A; \
> + uint64_t rb = SEED_B; \
> + uint64_t i, j; \
> + \
> + for (i = 0; i < n_ops; i += OPS_PER_ITER) { \
> + volatile PRECISION a = glue(get_random_, PRECISION)(&ra); \
> + volatile PRECISION b = glue(get_random_, PRECISION)(&rb); \
> + \
> + for (j = 0; j < OPS_PER_ITER; j++) { \
> + *res = a OP b; \
> + } \
> + } \
> + }
> +
> +GEN_BENCH_2OP(bench_float_add, +, float)
> +GEN_BENCH_2OP(bench_float_sub, -, float)
> +GEN_BENCH_2OP(bench_float_mul, *, float)
> +GEN_BENCH_2OP(bench_float_div, /, float)
> +
> +GEN_BENCH_2OP(bench_double_add, +, double)
> +GEN_BENCH_2OP(bench_double_sub, -, double)
> +GEN_BENCH_2OP(bench_double_mul, *, double)
> +GEN_BENCH_2OP(bench_double_div, /, double)
> +
> +#define GEN_BENCH_3OPF(NAME, FUNC, PRECISION) \
> + static void NAME(volatile PRECISION *res) \
> + { \
> + uint64_t ra = SEED_A; \
> + uint64_t rb = SEED_B; \
> + uint64_t rc = SEED_C; \
> + uint64_t i, j; \
> + \
> + for (i = 0; i < n_ops; i += OPS_PER_ITER) { \
> + volatile PRECISION a = glue(get_random_, PRECISION)(&ra); \
> + volatile PRECISION b = glue(get_random_, PRECISION)(&rb); \
> + volatile PRECISION c = glue(get_random_, PRECISION)(&rc); \
> + \
> + for (j = 0; j < OPS_PER_ITER; j++) { \
> + *res = FUNC(a, b, c); \
> + } \
> + } \
> + }
> +
> +GEN_BENCH_3OPF(bench_float_fma, fmaf, float)
> +GEN_BENCH_3OPF(bench_double_fma, fma, double)
> +#undef GEN_BENCH_3OPF
> +
> +static void parse_args(int argc, char *argv[])
> +{
> + int c;
> +
> + for (;;) {
> + c = getopt(argc, argv, "n:ho:p:");
> + if (c < 0) {
> + break;
> + }
> + switch (c) {
> + case 'h':
> + usage_complete(argc, argv);
> + exit(0);
> + case 'n':
> + n_ops = atoll(optarg);
> + if (n_ops < OPS_PER_ITER) {
> + n_ops = OPS_PER_ITER;
> + }
> + n_ops -= n_ops % OPS_PER_ITER;
> + break;
> + case 'o':
> + set_op(optarg);
> + break;
> + case 'p':
> + precision = optarg;
> + if (strcmp(precision, "float") &&
> + strcmp(precision, "single") &&
> + strcmp(precision, "double")) {
> + fprintf(stderr, "Unsupported precision '%s'\n", precision);
> + exit(EXIT_FAILURE);
Supporting half-precision, if the compiler supports it, would also be useful here.
> + }
> + break;
> + }
> + }
> +}
> +
> +#define CALL_BENCH(OP, PRECISION, RESP) \
> + do { \
> + switch (OP) { \
> + case OP_ADD: \
> + glue(glue(bench_, PRECISION), _add)(RESP); \
> + break; \
> + case OP_SUB: \
> + glue(glue(bench_, PRECISION), _sub)(RESP); \
> + break; \
> + case OP_MUL: \
> + glue(glue(bench_, PRECISION), _mul)(RESP); \
> + break; \
> + case OP_DIV: \
> + glue(glue(bench_, PRECISION), _div)(RESP); \
> + break; \
> + case OP_FMA: \
> + glue(glue(bench_, PRECISION), _fma)(RESP); \
> + break; \
> + case OP_SQRT: \
> + glue(glue(bench_, PRECISION), _sqrt)(RESP); \
> + break; \
> + default: \
> + g_assert_not_reached(); \
> + } \
> + } while (0)
> +
> +int main(int argc, char *argv[])
> +{
> + int64_t t0, t1;
> + double resd;
> +
> + parse_args(argc, argv);
> + if (!strcmp(precision, "float") || !strcmp(precision, "single")) {
> + float res;
> + t0 = get_clock_realtime();
> + CALL_BENCH(op, float, &res);
> + t1 = get_clock_realtime();
> + resd = res;
> + } else if (!strcmp(precision, "double")) {
> + t0 = get_clock_realtime();
> + CALL_BENCH(op, double, &resd);
> + t1 = get_clock_realtime();
> + } else {
> + g_assert_not_reached();
> + }
> + printf("%.2f MFlops\n", (double)n_ops / (t1 - t0) * 1e3);
> + if (resd) {
> + return 0;
> + }
> + return 0;
> +}
> diff --git a/tests/.gitignore b/tests/.gitignore
> index 18e58b2..df69175 100644
> --- a/tests/.gitignore
> +++ b/tests/.gitignore
> @@ -12,6 +12,7 @@ check-qobject
> check-qstring
> check-qom-interface
> check-qom-proplist
> +fp-bench
> qht-bench
> rcutorture
> test-aio
> diff --git a/tests/Makefile.include b/tests/Makefile.include
> index ef9b88c..f6121ee 100644
> --- a/tests/Makefile.include
> +++ b/tests/Makefile.include
> @@ -587,7 +587,7 @@ test-obj-y = tests/check-qnum.o tests/check-qstring.o
> tests/check-qdict.o \
> tests/rcutorture.o tests/test-rcu-list.o \
> tests/test-qdist.o tests/test-shift128.o \
> tests/test-qht.o tests/qht-bench.o tests/test-qht-par.o \
> - tests/atomic_add-bench.o
> + tests/atomic_add-bench.o tests/fp-bench.o
Not sure why, but "make check" didn't build this; I had to explicitly
run "make tests/fp-bench". I guess that, along with atomic_add-bench,
these are explicitly guest-facing tests, so maybe we should move them
once tests/tcg is working again. I'll have another run at that this week.
>
> $(test-obj-y): QEMU_INCLUDES += -Itests
> QEMU_CFLAGS += -I$(SRC_PATH)/tests
> @@ -639,6 +639,7 @@ tests/test-qht-par$(EXESUF): tests/test-qht-par.o
> tests/qht-bench$(EXESUF) $(tes
> tests/qht-bench$(EXESUF): tests/qht-bench.o $(test-util-obj-y)
> tests/test-bufferiszero$(EXESUF): tests/test-bufferiszero.o
> $(test-util-obj-y)
> tests/atomic_add-bench$(EXESUF): tests/atomic_add-bench.o $(test-util-obj-y)
> +tests/fp-bench$(EXESUF): tests/fp-bench.o $(test-util-obj-y)
>
> tests/test-qdev-global-props$(EXESUF): tests/test-qdev-global-props.o \
> hw/core/qdev.o hw/core/qdev-properties.o hw/core/hotplug.o\
Anyway, for this version:
Reviewed-by: Alex Bennée <address@hidden>
--
Alex Bennée
- Re: [Qemu-devel] [PATCH v1 08/14] hostfloat: support float32/64 addition and subtraction, (continued)
[Qemu-devel] [PATCH v1 09/14] hostfloat: support float32/64 multiplication, Emilio G. Cota, 2018/03/21
[Qemu-devel] [PATCH v1 01/14] tests: add fp-bench, a collection of simple floating-point microbenchmarks, Emilio G. Cota, 2018/03/21
- Re: [Qemu-devel] [PATCH v1 01/14] tests: add fp-bench, a collection of simple floating-point microbenchmarks,
Alex Bennée <=
[Qemu-devel] [PATCH v1 11/14] hostfloat: support float32/64 fused multiply-add, Emilio G. Cota, 2018/03/21
[Qemu-devel] [PATCH v1 10/14] hostfloat: support float32/64 division, Emilio G. Cota, 2018/03/21
[Qemu-devel] [PATCH v1 12/14] hostfloat: support float32/64 square root, Emilio G. Cota, 2018/03/21
[Qemu-devel] [PATCH v1 05/14] softfloat: add float32_is_normal and float64_is_normal, Emilio G. Cota, 2018/03/21
[Qemu-devel] [PATCH v1 13/14] hostfloat: support float32/64 comparison, Emilio G. Cota, 2018/03/21