Re: [Qemu-devel] [PATCH 07/10] virtio: combine the read of a descriptor
From: Gonglei (Arei)
Subject: Re: [Qemu-devel] [PATCH 07/10] virtio: combine the read of a descriptor
Date: Wed, 3 Feb 2016 12:34:44 +0000
Hi,
> Subject: [Qemu-devel] [PATCH 07/10] virtio: combine the read of a descriptor
>
> Compared to vring, virtio has a performance penalty of 10%. Fix it
> by combining all the reads for a descriptor in a single address_space_read
> call. This also simplifies the code nicely.
>
> Reviewed-by: Cornelia Huck <address@hidden>
> Signed-off-by: Paolo Bonzini <address@hidden>
> ---
>  hw/virtio/virtio.c | 86 ++++++++++++++++++++++--------------------------------
>  1 file changed, 35 insertions(+), 51 deletions(-)
>
Unbelievable! After applying this patch, virtio-crypto throughput reaches
74 MiB/sec, and host CPU overhead is 180% (100% on the main thread and 80%
across the vcpu threads).
Testing AES-128-CBC cipher:
Encrypting in chunks of 256 bytes: done. 371.94 MiB in 5.02 secs: 74.12 MiB/sec (1523475 packets)
Encrypting in chunks of 256 bytes: done. 369.85 MiB in 5.01 secs: 73.88 MiB/sec (1514900 packets)
Encrypting in chunks of 256 bytes: done. 371.07 MiB in 5.02 secs: 73.97 MiB/sec (1519914 packets)
Encrypting in chunks of 256 bytes: done. 371.66 MiB in 5.02 secs: 74.09 MiB/sec (1522309 packets)
Encrypting in chunks of 256 bytes: done. 371.79 MiB in 5.02 secs: 74.12 MiB/sec (1522868 packets)
Encrypting in chunks of 256 bytes: done. 371.94 MiB in 5.02 secs: 74.15 MiB/sec (1523457 packets)
Encrypting in chunks of 256 bytes: done. 371.90 MiB in 5.02 secs: 74.14 MiB/sec (1523317 packets)
Encrypting in chunks of 256 bytes: done. 371.71 MiB in 5.02 secs: 74.10 MiB/sec (1522522 packets)
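(As a sanity check on the numbers: 371.94 MiB / 5.02 s ≈ 74.1 MiB/s, and
1523475 packets x 256 bytes ≈ 371.9 MiB, so the throughput and packet counts
are self-consistent.)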
The host perf profile:
15.95% qemu-kvm [.] address_space_translate
6.98% qemu-kvm [.] qemu_get_ram_ptr
4.87% libpthread-2.19.so [.] __pthread_mutex_unlock_usercnt
4.40% qemu-kvm [.] qemu_ram_addr_from_host
3.79% qemu-kvm [.] address_space_map
3.41% libc-2.19.so [.] _int_malloc
3.29% libc-2.19.so [.] _int_free
3.07% libc-2.19.so [.] malloc
2.95% libpthread-2.19.so [.] pthread_mutex_lock
2.94% qemu-kvm [.] phys_page_find
2.73% qemu-kvm [.] address_space_translate_internal
2.65% libc-2.19.so [.] malloc_consolidate
2.35% libc-2.19.so [.] __memcpy_sse2_unaligned
1.72% qemu-kvm [.] find_next_zero_bit
1.38% qemu-kvm [.] address_space_rw
1.34% qemu-kvm [.] object_unref
1.30% qemu-kvm [.] object_ref
1.28% qemu-kvm [.] virtqueue_pop
1.20% libc-2.19.so [.] memset
1.11% qemu-kvm [.] virtio_notify
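The gain makes sense to me: the old code issued four separate physical reads
per descriptor (addr, len, flags, next), each paying the
address_space_translate cost that still tops the profile above, while the new
vring_desc_read() fetches the whole 16-byte descriptor with a single
address_space_read(). A standalone sketch of the pattern (guest_read(),
desc_read() and the struct below are hypothetical stand-ins, not QEMU APIs):

/* Sketch: one bulk copy per descriptor instead of four separate loads. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct vring_desc {          /* mirrors the 16-byte VRingDesc layout */
    uint64_t addr;           /* guest-physical buffer address */
    uint32_t len;            /* buffer length */
    uint16_t flags;          /* VRING_DESC_F_* */
    uint16_t next;           /* index of the next descriptor in the chain */
};

/* Stand-in for guest memory access; in QEMU this is address_space_read(),
 * whose translation cost dominates the profile above. */
static void guest_read(void *dst, const uint8_t *guest_mem,
                       uint64_t pa, size_t len)
{
    memcpy(dst, guest_mem + pa, len);
}

static void desc_read(struct vring_desc *d, const uint8_t *guest_mem,
                      uint64_t table_pa, unsigned i)
{
    /* One read for the whole descriptor; the old code did one per field.
     * On a big-endian host the virtio_tswap*s() swaps would follow here. */
    guest_read(d, guest_mem, table_pa + i * sizeof(*d), sizeof(*d));
}

int main(void)
{
    uint8_t guest_mem[sizeof(struct vring_desc)];
    struct vring_desc in = { 0x1000, 256, 1 /* VRING_DESC_F_NEXT */, 2 };
    struct vring_desc out;

    memcpy(guest_mem, &in, sizeof(in));   /* pretend the guest wrote it */
    desc_read(&out, guest_mem, 0, 0);
    printf("addr=0x%" PRIx64 " len=%u flags=%u next=%u\n",
           out.addr, out.len, out.flags, out.next);
    return 0;
}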
Thank you so much!
Regards,
-Gonglei
> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
> index 79a635f..2433866 100644
> --- a/hw/virtio/virtio.c
> +++ b/hw/virtio/virtio.c
> @@ -107,35 +107,15 @@ void virtio_queue_update_rings(VirtIODevice *vdev, int n)
>                                vring->align);
>  }
>
> -static inline uint64_t vring_desc_addr(VirtIODevice *vdev, hwaddr desc_pa,
> -                                       int i)
> +static void vring_desc_read(VirtIODevice *vdev, VRingDesc *desc,
> +                            hwaddr desc_pa, int i)
>  {
> -    hwaddr pa;
> -    pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, addr);
> -    return virtio_ldq_phys(vdev, pa);
> -}
> -
> -static inline uint32_t vring_desc_len(VirtIODevice *vdev, hwaddr desc_pa, int i)
> -{
> -    hwaddr pa;
> -    pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, len);
> -    return virtio_ldl_phys(vdev, pa);
> -}
> -
> -static inline uint16_t vring_desc_flags(VirtIODevice *vdev, hwaddr desc_pa,
> -                                        int i)
> -{
> -    hwaddr pa;
> -    pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, flags);
> -    return virtio_lduw_phys(vdev, pa);
> -}
> -
> -static inline uint16_t vring_desc_next(VirtIODevice *vdev, hwaddr desc_pa,
> -                                       int i)
> -{
> -    hwaddr pa;
> -    pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, next);
> -    return virtio_lduw_phys(vdev, pa);
> +    address_space_read(&address_space_memory, desc_pa + i * sizeof(VRingDesc),
> +                       MEMTXATTRS_UNSPECIFIED, (void *)desc, sizeof(VRingDesc));
> +    virtio_tswap64s(vdev, &desc->addr);
> +    virtio_tswap32s(vdev, &desc->len);
> +    virtio_tswap16s(vdev, &desc->flags);
> +    virtio_tswap16s(vdev, &desc->next);
>  }
>
>  static inline uint16_t vring_avail_flags(VirtQueue *vq)
> @@ -345,18 +325,18 @@ static unsigned int virtqueue_get_head(VirtQueue *vq, unsigned int idx)
>      return head;
>  }
>
> -static unsigned virtqueue_next_desc(VirtIODevice *vdev, hwaddr desc_pa,
> -                                    unsigned int i, unsigned int max)
> +static unsigned virtqueue_read_next_desc(VirtIODevice *vdev, VRingDesc *desc,
> +                                         hwaddr desc_pa, unsigned int max)
>  {
>      unsigned int next;
>
>      /* If this descriptor says it doesn't chain, we're done. */
> -    if (!(vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_NEXT)) {
> +    if (!(desc->flags & VRING_DESC_F_NEXT)) {
>          return max;
>      }
>
>      /* Check they're not leading us off end of descriptors. */
> -    next = vring_desc_next(vdev, desc_pa, i);
> +    next = desc->next;
>      /* Make sure compiler knows to grab that: we don't want it changing! */
>      smp_wmb();
>
> @@ -365,6 +345,7 @@ static unsigned virtqueue_next_desc(VirtIODevice *vdev, hwaddr desc_pa,
>          exit(1);
>      }
>
> +    vring_desc_read(vdev, desc, desc_pa, next);
>      return next;
>  }
>
> @@ -381,6 +362,7 @@ void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes,
>      while (virtqueue_num_heads(vq, idx)) {
>          VirtIODevice *vdev = vq->vdev;
>          unsigned int max, num_bufs, indirect = 0;
> +        VRingDesc desc;
>          hwaddr desc_pa;
>          int i;
>
> @@ -388,9 +370,10 @@ void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes,
>          num_bufs = total_bufs;
>          i = virtqueue_get_head(vq, idx++);
>          desc_pa = vq->vring.desc;
> +        vring_desc_read(vdev, &desc, desc_pa, i);
>
> -        if (vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_INDIRECT) {
> -            if (vring_desc_len(vdev, desc_pa, i) % sizeof(VRingDesc)) {
> +        if (desc.flags & VRING_DESC_F_INDIRECT) {
> +            if (desc.len % sizeof(VRingDesc)) {
>                  error_report("Invalid size for indirect buffer table");
>                  exit(1);
>              }
> @@ -403,9 +386,10 @@ void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes,
>
>              /* loop over the indirect descriptor table */
>              indirect = 1;
> -            max = vring_desc_len(vdev, desc_pa, i) / sizeof(VRingDesc);
> -            desc_pa = vring_desc_addr(vdev, desc_pa, i);
> +            max = desc.len / sizeof(VRingDesc);
> +            desc_pa = desc.addr;
>              num_bufs = i = 0;
> +            vring_desc_read(vdev, &desc, desc_pa, i);
>          }
>
>          do {
> @@ -415,15 +399,15 @@ void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes,
>                  exit(1);
>              }
>
> -            if (vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_WRITE) {
> -                in_total += vring_desc_len(vdev, desc_pa, i);
> +            if (desc.flags & VRING_DESC_F_WRITE) {
> +                in_total += desc.len;
>              } else {
> -                out_total += vring_desc_len(vdev, desc_pa, i);
> +                out_total += desc.len;
>              }
>              if (in_total >= max_in_bytes && out_total >= max_out_bytes) {
>                  goto done;
>              }
> -        } while ((i = virtqueue_next_desc(vdev, desc_pa, i, max)) != max);
> +        } while ((i = virtqueue_read_next_desc(vdev, &desc, desc_pa, max)) != max);
>
>          if (!indirect)
>              total_bufs = num_bufs;
> @@ -545,6 +529,7 @@ void *virtqueue_pop(VirtQueue *vq, size_t sz)
>      unsigned out_num, in_num;
>      hwaddr addr[VIRTQUEUE_MAX_SIZE];
>      struct iovec iov[VIRTQUEUE_MAX_SIZE];
> +    VRingDesc desc;
>
>      if (!virtqueue_num_heads(vq, vq->last_avail_idx)) {
>          return NULL;
> @@ -560,33 +545,32 @@ void *virtqueue_pop(VirtQueue *vq, size_t sz)
>          vring_set_avail_event(vq, vq->last_avail_idx);
>      }
>
> -    if (vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_INDIRECT) {
> -        if (vring_desc_len(vdev, desc_pa, i) % sizeof(VRingDesc)) {
> +    vring_desc_read(vdev, &desc, desc_pa, i);
> +    if (desc.flags & VRING_DESC_F_INDIRECT) {
> +        if (desc.len % sizeof(VRingDesc)) {
>              error_report("Invalid size for indirect buffer table");
>              exit(1);
>          }
>
>          /* loop over the indirect descriptor table */
> -        max = vring_desc_len(vdev, desc_pa, i) / sizeof(VRingDesc);
> -        desc_pa = vring_desc_addr(vdev, desc_pa, i);
> +        max = desc.len / sizeof(VRingDesc);
> +        desc_pa = desc.addr;
>          i = 0;
> +        vring_desc_read(vdev, &desc, desc_pa, i);
>      }
>
>      /* Collect all the descriptors */
>      do {
> -        hwaddr pa = vring_desc_addr(vdev, desc_pa, i);
> -        size_t len = vring_desc_len(vdev, desc_pa, i);
> -
> -        if (vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_WRITE) {
> +        if (desc.flags & VRING_DESC_F_WRITE) {
>              virtqueue_map_desc(&in_num, addr + out_num, iov + out_num,
> -                               VIRTQUEUE_MAX_SIZE - out_num, true, pa, len);
> +                               VIRTQUEUE_MAX_SIZE - out_num, true,
> +                               desc.addr, desc.len);
>          } else {
>              if (in_num) {
>                  error_report("Incorrect order for descriptors");
>                  exit(1);
>              }
>              virtqueue_map_desc(&out_num, addr, iov,
> -                               VIRTQUEUE_MAX_SIZE, false, pa, len);
> +                               VIRTQUEUE_MAX_SIZE, false, desc.addr, desc.len);
>          }
>
>          /* If we've got too many, that implies a descriptor loop. */
> @@ -594,7 +578,7 @@ void *virtqueue_pop(VirtQueue *vq, size_t sz)
>              error_report("Looped descriptor");
>              exit(1);
>          }
> -    } while ((i = virtqueue_next_desc(vdev, desc_pa, i, max)) != max);
> +    } while ((i = virtqueue_read_next_desc(vdev, &desc, desc_pa, max)) != max);
>
>      /* Now copy what we have collected and mapped */
>      elem = virtqueue_alloc_element(sz, out_num, in_num);
> --
> 2.5.0
>
>