qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [RFC PATCH v2 1/3] target-arm: Use Neon for zero checki


From: Paolo Bonzini
Subject: Re: [Qemu-devel] [RFC PATCH v2 1/3] target-arm: Use Neon for zero checking
Date: Thu, 7 Apr 2016 06:30:07 -0400 (EDT)

> +#elif defined __aarch64__
> +#include "arm_neon.h"
> +
> +#define NEON_VECTYPE               uint64x2_t
> +#define NEON_LOAD_N_ORR(v1, v2)    (vld1q_u64(&v1) | vld1q_u64(&v2))

Why is the load and orr necessary?  Is ((v1) | (v2)) enough?

> +#define NEON_ORR(v1, v2)           ((v1) | (v2))
> +#define NEON_NOT_EQ_ZERO(v1) \
> +        ((vgetq_lane_u64(v1, 0) != 0) || (vgetq_lane_u64(v1, 1) != 0))
> +
> +#define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON 16

Unless you have numbers saying that a 16-unroll is better than an 8-unroll
(and then you should put those in the commit message), you do not need to
duplicate code, just add aarch64 definitions for the existing code.

---

I've now read the rest of the patches, and you're adding prefetch code
that is ARM-specific.  Please provide numbers separately for each
patch, not just in the cover letter.  The cover letter is lost when the
patch is committed, while the commit messages remain.

On top of this, "With these changes, total migration time comes down
from 10 seconds to 2.5 seconds" is not a reproducible experiment.
What was the RAM size?  Was the guest just booted and idle, or was
there a workload?  What was the host?

Thanks,

Paolo

> +/*
> + * Zero page/buffer checking using SIMD(Neon)
> + */
> +
> +static bool
> +can_use_buffer_find_nonzero_offset_neon(const void *buf, size_t len)
> +{
> +    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON
> +                   * sizeof(NEON_VECTYPE)) == 0
> +            && ((uintptr_t) buf) % sizeof(NEON_VECTYPE) == 0);
> +}
> +
> +static size_t buffer_find_nonzero_offset_neon(const void *buf, size_t len)
> +{
> +    size_t i;
> +    NEON_VECTYPE qword0, qword1, qword2, qword3, qword4, qword5, qword6;
> +    uint64_t const *data = buf;
> +
> +    if (!len) {
> +        return 0;
> +    }
> +
> +    assert(can_use_buffer_find_nonzero_offset_neon(buf, len));
> +    len /= sizeof(unsigned long);
> +
> +    for (i = 0; i < len; i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR_NEON)
> {
> +        qword0 = NEON_LOAD_N_ORR(data[i], data[i + 2]);
> +        qword1 = NEON_LOAD_N_ORR(data[i + 4], data[i + 6]);
> +        qword2 = NEON_LOAD_N_ORR(data[i + 8], data[i + 10]);
> +        qword3 = NEON_LOAD_N_ORR(data[i + 12], data[i + 14]);
> +        qword4 = NEON_ORR(qword0, qword1);
> +        qword5 = NEON_ORR(qword2, qword3);
> +        qword6 = NEON_ORR(qword4, qword5);
> +
> +        if (NEON_NOT_EQ_ZERO(qword6)) {
> +            break;
> +        }
> +    }
> +
> +    return i * sizeof(unsigned long);
> +}
> +
> +static inline bool neon_support(void)
> +{
> +    /*
> +     * Check if neon feature is supported.
> +     * By default neon is supported for aarch64.
> +     */
> +    return true;

Then everything below this function is not necessary.

Paolo

> +}
> +
> +bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
> +{
> +    return neon_support() ? can_use_buffer_find_nonzero_offset_neon(buf,
> len) :
> +           can_use_buffer_find_nonzero_offset_inner(buf, len);
> +}
> +
> +size_t buffer_find_nonzero_offset(const void *buf, size_t len)
> +{
> +    return neon_support() ? buffer_find_nonzero_offset_neon(buf, len) :
> +           buffer_find_nonzero_offset_inner(buf, len);
> +}
>  #else
>  bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
>  {
> --
> 1.7.9.5
> 
> 



reply via email to

[Prev in Thread] Current Thread [Next in Thread]