qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Qemu-devel] [PATCH v3 2/2] main-loop: Use epoll on Linux


From: Fam Zheng
Subject: [Qemu-devel] [PATCH v3 2/2] main-loop: Use epoll on Linux
Date: Mon, 27 Oct 2014 15:30:48 +0800

A new implementation of qemu_poll_ns based on epoll is introduced here
to address the slowness of g_poll and ppoll when the number of fds is
high.

On my laptop this reduces the response time of virtio-blk on top of a
null-aio device from 32 us to 29 us with few fds (~10), and from 48 us
to 32 us with more fds (for example when virtio-serial is plugged and
~64 more io handlers are enabled).

Signed-off-by: Fam Zheng <address@hidden>
---
 Makefile.objs            |   1 +
 include/qemu/main-loop.h |   1 +
 include/qemu/timer.h     |  10 +++
 qemu-epoll.c             | 161 +++++++++++++++++++++++++++++++++++++++++++++++
 qemu-timer.c             |   4 +-
 tests/Makefile           |   2 +-
 6 files changed, 177 insertions(+), 2 deletions(-)
 create mode 100644 qemu-epoll.c

diff --git a/Makefile.objs b/Makefile.objs
index 18fd35c..db4a487 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -9,6 +9,7 @@ util-obj-y = util/ qobject/ qapi/ qapi-types.o qapi-visit.o 
qapi-event.o
 block-obj-y = async.o thread-pool.o
 block-obj-y += nbd.o block.o blockjob.o
 block-obj-y += main-loop.o iohandler.o qemu-timer.o
+block-obj-$(CONFIG_LINUX) += qemu-epoll.o
 block-obj-$(CONFIG_POSIX) += aio-posix.o
 block-obj-$(CONFIG_WIN32) += aio-win32.o
 block-obj-y += block/
diff --git a/include/qemu/main-loop.h b/include/qemu/main-loop.h
index 62c68c0..d51bf4d 100644
--- a/include/qemu/main-loop.h
+++ b/include/qemu/main-loop.h
@@ -307,5 +307,6 @@ void qemu_iohandler_poll(GArray *pollfds, int rc);
 
 QEMUBH *qemu_bh_new(QEMUBHFunc *cb, void *opaque);
 void qemu_bh_schedule_idle(QEMUBH *bh);
+int qemu_epoll(QEMUPollContext *ctx, GPollFD *fds, guint nfds, int64_t timeout);
 
 #endif
diff --git a/include/qemu/timer.h b/include/qemu/timer.h
index be2a4a3..4097bc0 100644
--- a/include/qemu/timer.h
+++ b/include/qemu/timer.h
@@ -646,6 +646,16 @@ void timer_put(QEMUFile *f, QEMUTimer *ts);
 int qemu_timeout_ns_to_ms(int64_t ns);
 
 typedef struct {
+    /* A copy of last fd array, used to skip epoll_prepare when nothing
+     * changed. */
+    GPollFD *last_fds;
+    guint last_nfds;
+    /* An array of fds that failed epoll_ctl and fall back to g_poll. This
+     * should be rare. */
+    GPollFD *g_poll_fds;
+    guint g_poll_nfds;
+    int *g_poll_fd_idx;
+    int epollfd;
 } QEMUPollContext;
 
 /**
diff --git a/qemu-epoll.c b/qemu-epoll.c
new file mode 100644
index 0000000..505a3be
--- /dev/null
+++ b/qemu-epoll.c
@@ -0,0 +1,161 @@
+/*
+ * QEMU Event Loop
+ *
+ * Copyright (c) 2014 Red Hat, Inc.
+ *
+ * Authors:
+ *      Fam Zheng <address@hidden>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <sys/epoll.h>
+#include "qemu/main-loop.h"
+
+static bool g_poll_fds_changed(const GPollFD *fds_a, const guint nfds_a,
+                               const GPollFD *fds_b, const guint nfds_b)
+{
+    int i;
+
+    if (nfds_a != nfds_b) {
+        return true;
+    }
+    if (!!fds_a != !!fds_b) {
+        return true;
+    }
+    for (i = 0; i < nfds_a; i++) {
+        if (fds_a[i].fd != fds_b[i].fd ||
+            fds_a[i].events != fds_b[i].events) {
+            return true;
+        }
+    }
+    return false;
+}
+
+static inline int io_condition_from_epoll_events(int e)
+{
+    return (e & EPOLLIN  ? G_IO_IN : 0) |
+           (e & EPOLLOUT ? G_IO_OUT : 0) |
+           (e & EPOLLERR ? G_IO_ERR : 0) |
+           (e & EPOLLHUP ? G_IO_HUP : 0);
+}
+
+static inline void epoll_event_from_g_poll_fd(struct epoll_event *event,
+                                              GPollFD *fd)
+{
+    int e = fd->events;
+
+    event->events = (e & G_IO_IN  ? EPOLLIN : 0) |
+                    (e & G_IO_OUT ? EPOLLOUT : 0) |
+                    (e & G_IO_ERR ? EPOLLERR : 0) |
+                    (e & G_IO_HUP ? EPOLLHUP : 0);
+    event->data.ptr = fd;
+}
+
+static int epoll_prepare(int epollfd,
+                         GPollFD *fds, guint nfds,
+                         GPollFD **g_poll_fds,
+                         guint *g_poll_nfds,
+                         int **g_poll_fd_idx)
+{
+    int i;
+
+    GPollFD *pfds = NULL;
+    int npfds = 0;
+    int *idx = NULL;
+
+    for (i = 0; i < nfds; i++) {
+        int r;
+        struct epoll_event event;
+        epoll_event_from_g_poll_fd(&event, &fds[i]);
+
+        r = epoll_ctl(epollfd, EPOLL_CTL_ADD, fds[i].fd, &event);
+        if (r) {
+            /* Some fds may not support epoll; fall back and add them to
+             * the g_poll fd array (pfds) */
+            pfds = g_renew(GPollFD, pfds, npfds + 1);
+            pfds[npfds] = fds[i];
+            idx = g_renew(int, idx, npfds + 1);
+            idx[npfds] = i;
+            npfds++;
+        }
+    }
+
+    *g_poll_fds = pfds;
+    *g_poll_nfds = npfds;
+    *g_poll_fd_idx = idx;
+
+    return epollfd;
+}
+
+int qemu_epoll(QEMUPollContext *ctx, GPollFD *fds, guint nfds, int64_t timeout)
+{
+    const int max_events = 40;
+    struct epoll_event events[max_events];
+    int ret = 0;
+    int r, i;
+
+    if (!ctx->last_fds || g_poll_fds_changed(fds, nfds,
+                                           ctx->last_fds, ctx->last_nfds)) {
+        if (ctx->last_fds) {
+            close(ctx->epollfd);
+        }
+        ctx->epollfd = epoll_create(1);
+        if (ctx->epollfd < 0) {
+            perror("epoll_create");
+            abort();
+        }
+        g_free(ctx->g_poll_fds);
+        g_free(ctx->g_poll_fd_idx);
+        ctx->epollfd = epoll_prepare(ctx->epollfd, fds, nfds,
+                                     &ctx->g_poll_fds,
+                                     &ctx->g_poll_nfds,
+                                     &ctx->g_poll_fd_idx);
+        g_free(ctx->last_fds);
+        ctx->last_fds = g_memdup(fds, nfds * sizeof(GPollFD));
+        ctx->last_nfds = nfds;
+    }
+    if (ctx->g_poll_nfds) {
+        ret = g_poll(ctx->g_poll_fds, ctx->g_poll_nfds,
+                     qemu_timeout_ns_to_ms(timeout));
+        if (ret < 0) {
+            return ret;
+        }
+        /* Sync revents back to original fds */
+        for (i = 0; i < ret; i++) {
+            GPollFD *fd = &fds[ctx->g_poll_fd_idx[i]];
+            assert(fd->fd == ctx->g_poll_fds[i].fd);
+            fd->revents = ctx->g_poll_fds[i].revents;
+        }
+    }
+
+    r = epoll_wait(ctx->epollfd, events, max_events,
+                   qemu_timeout_ns_to_ms(timeout));
+    if (r < 0) {
+        return r;
+    }
+
+    for (i = 0; i < r; i++) {
+        GPollFD *gpfd = events[i].data.ptr;
+        gpfd->revents = io_condition_from_epoll_events(events[i].events);
+    }
+
+    ret += r;
+    return ret;
+}
diff --git a/qemu-timer.c b/qemu-timer.c
index fe78fdf..fbb9ded 100644
--- a/qemu-timer.c
+++ b/qemu-timer.c
@@ -310,7 +310,9 @@ int qemu_timeout_ns_to_ms(int64_t ns)
 int qemu_poll_ns(QEMUPollContext *ctx, GPollFD *fds,
                  guint nfds, int64_t timeout)
 {
-#ifdef CONFIG_PPOLL
+#ifdef CONFIG_LINUX
+    return qemu_epoll(ctx, fds, nfds, timeout);
+#elif CONFIG_PPOLL
     if (timeout < 0) {
         return ppoll((struct pollfd *)fds, nfds, NULL, NULL);
     } else {
diff --git a/tests/Makefile b/tests/Makefile
index 16f0e4c..575ffd2 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -348,7 +348,7 @@ tests/usb-hcd-ohci-test$(EXESUF): tests/usb-hcd-ohci-test.o 
$(libqos-usb-obj-y)
 tests/usb-hcd-uhci-test$(EXESUF): tests/usb-hcd-uhci-test.o $(libqos-usb-obj-y)
 tests/usb-hcd-ehci-test$(EXESUF): tests/usb-hcd-ehci-test.o $(libqos-usb-obj-y)
 tests/usb-hcd-xhci-test$(EXESUF): tests/usb-hcd-xhci-test.o $(libqos-usb-obj-y)
-tests/vhost-user-test$(EXESUF): tests/vhost-user-test.o qemu-char.o qemu-timer.o $(qtest-obj-y)
+tests/vhost-user-test$(EXESUF): tests/vhost-user-test.o qemu-char.o qemu-timer.o qemu-epoll.o $(qtest-obj-y)
 tests/qemu-iotests/socket_scm_helper$(EXESUF): 
tests/qemu-iotests/socket_scm_helper.o
 tests/test-qemu-opts$(EXESUF): tests/test-qemu-opts.o libqemuutil.a 
libqemustub.a
 
-- 
1.9.3




reply via email to

[Prev in Thread] Current Thread [Next in Thread]