[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Qemu-devel] [PATCH] main-loop: Use epoll on Linux
From: Fam Zheng
Subject: Re: [Qemu-devel] [PATCH] main-loop: Use epoll on Linux
Date: Mon, 29 Sep 2014 17:17:48 +0800
User-agent: Mutt/1.5.23 (2014-03-12)
On Mon, 09/29 13:26, Fam Zheng wrote:
> A new implementation for qemu_poll_ns based on epoll is introduced here
> to address the slowness of g_poll and ppoll when the number of fds is
> high.
>
> On my laptop this would reduce the virtio-blk on top of null-aio
> device's response time from 32 us to 29 us with few fds (~10), and 48 us
> to 32 us with more fds (for example when virtio-serial is plugged and
> ~64 more io handlers are enabled).
>
> Signed-off-by: Fam Zheng <address@hidden>
> ---
> Makefile.objs | 1 +
> include/qemu/main-loop.h | 1 +
> qemu-epoll.c | 165
> +++++++++++++++++++++++++++++++++++++++++++++++
> qemu-timer.c | 4 +-
> tests/Makefile | 2 +-
> 5 files changed, 171 insertions(+), 2 deletions(-)
> create mode 100644 qemu-epoll.c
>
> diff --git a/Makefile.objs b/Makefile.objs
> index 97db978..52ee086 100644
> --- a/Makefile.objs
> +++ b/Makefile.objs
> @@ -9,6 +9,7 @@ util-obj-y = util/ qobject/ qapi/ qapi-types.o qapi-visit.o
> qapi-event.o
> block-obj-y = async.o thread-pool.o
> block-obj-y += nbd.o block.o blockjob.o
> block-obj-y += main-loop.o iohandler.o qemu-timer.o
> +block-obj-$(CONFIG_LINUX) += qemu-epoll.o
> block-obj-$(CONFIG_POSIX) += aio-posix.o
> block-obj-$(CONFIG_WIN32) += aio-win32.o
> block-obj-y += block/
> diff --git a/include/qemu/main-loop.h b/include/qemu/main-loop.h
> index 62c68c0..eb01b95 100644
> --- a/include/qemu/main-loop.h
> +++ b/include/qemu/main-loop.h
> @@ -307,5 +307,6 @@ void qemu_iohandler_poll(GArray *pollfds, int rc);
>
> QEMUBH *qemu_bh_new(QEMUBHFunc *cb, void *opaque);
> void qemu_bh_schedule_idle(QEMUBH *bh);
> +int qemu_epoll(GPollFD *fds, guint nfds, int64_t timeout);
>
> #endif
> diff --git a/qemu-epoll.c b/qemu-epoll.c
> new file mode 100644
> index 0000000..89ec12a
> --- /dev/null
> +++ b/qemu-epoll.c
> @@ -0,0 +1,165 @@
> +/*
> + * QEMU Event Loop
> + *
> + * Copyright (c) 2014 Red Hat, Inc.
> + *
> + * Authors:
> + * Fam Zheng <address@hidden>
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> copy
> + * of this software and associated documentation files (the "Software"), to
> deal
> + * in the Software without restriction, including without limitation the
> rights
> + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
> + * copies of the Software, and to permit persons to whom the Software is
> + * furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> FROM,
> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
> + * THE SOFTWARE.
> + */
> +
> +#include <sys/epoll.h>
> +#include "qemu/main-loop.h"
> +
> +static bool g_poll_fds_changed(const GPollFD *fds_a, const guint nfds_a,
> + const GPollFD *fds_b, const guint nfds_b)
> +{
> + int i;
> +
> + if (nfds_a != nfds_b) {
> + return true;
> + }
> + if (!!fds_a != !!fds_b) {
> + return true;
> + }
> + for (i = 0; i < nfds_a; i++) {
> + if (fds_a[i].fd != fds_b[i].fd ||
> + fds_a[i].events != fds_b[i].events) {
> + return true;
> + }
> + }
> + return false;
> +}
> +
> +static inline int g_io_condition_from_epoll_events(int e)
> +{
> + return (e & EPOLLIN ? G_IO_IN : 0) |
> + (e & EPOLLOUT ? G_IO_OUT : 0) |
> + (e & EPOLLERR ? G_IO_ERR : 0) |
> + (e & EPOLLHUP ? G_IO_HUP : 0);
> +}
> +
> +static inline void epoll_event_from_g_poll_fd(struct epoll_event *event,
> + GPollFD *fd)
> +{
> + int e = fd->events;
> +
> + event->events = (e & G_IO_IN ? EPOLLIN : 0) |
> + (e & G_IO_OUT ? EPOLLOUT : 0) |
> + (e & G_IO_ERR ? EPOLLERR : 0) |
> + (e & G_IO_HUP ? EPOLLHUP : 0);
> + event->data.ptr = fd;
> +}
> +
> +static int epoll_prepare(int epollfd,
> + GPollFD *fds, guint nfds,
> + GPollFD **g_poll_fds,
> + guint *g_poll_nfds,
> + int **g_poll_fd_idx)
> +{
> + int i;
> +
> + GPollFD *pfds = NULL;
> + int npfds = 0;
> + int *idx = NULL;
> +
> + for (i = 0; i < nfds; i++) {
> + int r;
> + struct epoll_event event;
> + epoll_event_from_g_poll_fd(&event, &fds[i]);
> +
> + r = epoll_ctl(epollfd, EPOLL_CTL_ADD, fds[i].fd, &event);
> + if (r) {
> + /* Some fds may not support epoll, fall back and add them to
> + * ppoll_fds */
> + pfds = g_renew(GPollFD, pfds, npfds + 1);
> + pfds[npfds] = fds[i];
> + idx = g_renew(int, idx, npfds + 1);
> + idx[npfds] = i;
> + npfds++;
> + }
> + }
> +
> + g_free(*g_poll_fds);
> + *g_poll_fds = pfds;
> + *g_poll_nfds = npfds;
> + *g_poll_fd_idx = idx;
> +
> + return epollfd;
> +}
> +
> +int qemu_epoll(GPollFD *fds, guint nfds, int64_t timeout)
> +{
> + /* A copy of last fd array, used to skip epoll_prepare when nothing
> + * changed. */
> + static GPollFD *last_fds;
> + static guint last_nfds;
> + /* An array of fds that failed epoll_ctl and fall back to ppoll. Rare
> case
> + * too. */
> + static GPollFD *g_poll_fds;
> + static guint g_poll_nfds;
> + static int *g_poll_fd_idx;
> + static int epollfd = -1;
> + const int max_events = 40;
> + struct epoll_event events[max_events];
> + int ret = 0;
> + int r, i;
> +
> + if (!last_fds || g_poll_fds_changed(fds, nfds, last_fds, last_nfds)) {
> + if (epollfd >= 0) {
> + close(epollfd);
> + }
> + epollfd = epoll_create(1);
> + if (epollfd < 0) {
> + perror("epoll_create");
> + abort();
> + }
> + epollfd = epoll_prepare(epollfd, fds, nfds, &g_poll_fds,
> &g_poll_nfds,
> + &g_poll_fd_idx);
> + last_fds = g_memdup(fds, nfds * sizeof(GPollFD));
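g_poll_fd_idx and last_fds are both leaked: every time the fd set changes,
epoll_prepare() overwrites *g_poll_fd_idx without freeing the previous index
array (only *g_poll_fds is g_free()'d), and last_fds is reassigned here from
g_memdup() without a g_free() of the previous copy.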
Fam
> + last_nfds = nfds;
> + }
> + if (g_poll_nfds) {
> + ret = g_poll(g_poll_fds, g_poll_nfds,
> qemu_timeout_ns_to_ms(timeout));
> + if (ret < 0) {
> + return ret;
> + }
> + /* Sync revents back to original fds */
> + for (i = 0; i < ret; i++) {
> + GPollFD *fd = &fds[g_poll_fd_idx[i]];
> + assert(fd->fd == g_poll_fds[i].fd);
> + fd->revents = g_poll_fds[i].revents;
> + }
> + }
> +
> + r = epoll_wait(epollfd, events, max_events,
> + qemu_timeout_ns_to_ms(timeout));
> + if (r < 0) {
> + return r;
> + }
> +
> + for (i = 0; i < r; i++) {
> + GPollFD *gpfd = events[i].data.ptr;
> + gpfd->revents = g_io_condition_from_epoll_events(events[i].events);
> + }
> +
> + ret += r;
> + return ret;
> +}